Beispiel #1
0
def seekRank(df, mode):
    """Find the rank of the known molecule among each row's candidates.

    For every row of ``df`` the target InChIKey is taken from
    ``parameters.inchi`` (prefixed with ``InChI=1S/`` and converted with
    ``inchikey_from_inchi``), or, when that value is null or blank, from the
    ``parameters.inchikey`` column.  The target is then compared against the
    InChIKeys of ``row['peaks'][0]['annotations'][0]['mol_candidates']``.

    InChI strings are assembled the same way as in
    ``html_report.generate_candidate_html``.

    Args:
        df: pandas DataFrame with the columns ``parameters.inchi``,
            ``peaks`` and, depending on ``mode``,
            ``parameters.db_molecule_name``; ``parameters.inchikey`` is
            optional.
        mode: 1 labels each result with ``parameters.db_molecule_name``,
            0 labels it with the target InChIKey.

    Returns:
        A list with one ``(label, value)`` tuple per row.  ``value`` is the
        1-based position of the first matching candidate, ``0`` when no
        candidate matches, or one of the strings ``'No inchi/inchikey'`` /
        ``'No annotations'``.  Rows without any InChI/InChIKey are always
        labelled with the molecule name.
    """
    assert mode in {0, 1}
    lst = []
    for _, row in df.iterrows():
        raw_inchi = row['parameters.inchi']
        # Must be a short-circuit `or`: with `|` the expression parses as
        # `(isnull | len(...)) == 0`, which is False for a null InChI and
        # would send the literal 'nan' through the InChI branch.
        if pd.isnull(raw_inchi) or len(str(raw_inchi).strip()) == 0:
            if 'parameters.inchikey' not in row.index:
                lst.append(
                    (row['parameters.db_molecule_name'], 'No inchi/inchikey'))
                continue
            target_inchikey = row['parameters.inchikey']
        else:
            target_inchikey = inchikey_from_inchi('InChI=1S/%s' % raw_inchi)

        content = row['peaks'][0]
        if mode == 1:
            label = row['parameters.db_molecule_name']
        else:
            label = target_inchikey

        if 'annotations' not in content:
            lst.append((label, 'No annotations'))
            continue

        candidates = content['annotations'][0]['mol_candidates']
        indice = 0  # 0 means "target not among the candidates"
        for rank, cand in enumerate(candidates, start=1):
            if inchikey_from_inchi('InChI=1S/%s' % cand['InChI']) == target_inchikey:
                indice = rank
                break
        lst.append((label, indice))
    return lst
def save_inchi(peaklist, outdir, mode):
    """Write one ``<index>.inchi`` file per peak with its InChI and InChIKey.

    Files go to ``<outdir>/positive`` when ``mode`` is 1 and to
    ``<outdir>/negative`` otherwise; the directory is created if it does not
    exist and its path is printed.  Each file holds a single tab-separated
    line: the ``'inchi'`` parameter of the peak's parent spectrum and the
    InChIKey computed from it by ``inchikey_from_inchi``.

    Args:
        peaklist: sequence of peaks exposing ``parent_spectrum.parameters``.
        outdir: base output directory.
        mode: 1 selects the ``positive`` sub-directory, anything else
            ``negative``.
    """
    outpath = ('%s/positive' if mode == 1 else '%s/negative') % outdir

    if not os.path.exists(outpath):
        os.makedirs(outpath)

    print(outpath)
    for number, peak in enumerate(peaklist):
        # The file is opened before the InChI is looked up, so a failing
        # lookup still leaves the (empty) file behind.
        with open('%s/%s.inchi' % (outpath, number), 'w') as fout:
            source_inchi = peak.parent_spectrum.parameters['inchi']
            key = inchikey_from_inchi(source_inchi)
            fout.write('%s\t%s\n' % (source_inchi, key))
Beispiel #3
0
    def _get_next_raw_record(self):
        """Read the next database line and parse it into a MolecularRecord.

        Records live in ``.st2`` shard files addressed by the integer
        ``self.mzcurrent``: one directory level each for the thousands,
        hundreds and tens digit, with the units digit as the file name.
        When the current shard is exhausted (or none is open yet) the method
        advances ``self.mzcurrent`` to the next existing shard, updating
        ``self.currentfile``, ``self.datafile`` and ``self.record_index``.

        Only the fields named in ``self.required_fields`` are filled in,
        besides ``MZ``, ``Mass`` and ``Charge`` which are always set.

        Returns:
            The populated ``MolecularRecord``.

        Raises:
            StopIteration: when ``self.mzcurrent`` has moved past
                ``self.mzmax_int``.
        """
        def shard_path():
            # Shard location for the current value of self.mzcurrent.
            return os.path.join(self.database_path, self.db_name, self.subf,
                                str(self.mzcurrent // 1000),
                                str(self.mzcurrent % 1000 // 100),
                                str(self.mzcurrent % 100 // 10),
                                '%s.st2' % str(self.mzcurrent % 10))

        def open_next_shard():
            # Skip over m/z values without a shard file, then open the first
            # one that exists or stop once the upper bound is passed.
            self.currentfile = shard_path()
            while (not os.path.isfile(self.currentfile)
                   and self.mzcurrent <= self.mzmax_int):
                self.mzcurrent += 1
                self.currentfile = shard_path()
            if self.mzcurrent > self.mzmax_int:
                raise StopIteration()
            self.datafile = open(self.currentfile, 'r')
            self.record_index = -1

        if self.currentfile == '':
            open_next_shard()

        line = self.datafile.readline()
        self.record_index += 1
        # An empty string from readline() means end of file: move on to the
        # following shard until a line is obtained.
        while line == '':
            self.datafile.close()
            self.mzcurrent += 1
            open_next_shard()
            line = self.datafile.readline()
            self.record_index += 1

        fields = line.rstrip('\n').split('\t')
        record = MolecularRecord()
        record['MZ'] = float(fields[0])
        if self.charged:
            record['Mass'] = float(fields[1])
            record['Charge'] = float(fields[2])
        else:
            record['Mass'] = record['MZ']
            record['Charge'] = 0

        wanted = self.required_fields

        # Column positions after the leading numeric columns are shifted by
        # self.offs.
        if 'ShortInChI' in wanted:
            record['ShortInChI'] = parse_inchi(fields[2 + self.offs])[0]

        if 'InChI' in wanted:
            record['InChI'] = fields[2 + self.offs]

        if 'SMILES' in wanted:
            record['SMILES'] = fields[3 + self.offs]

        if 'IDs' in wanted:
            record['IDs'] = fields[4 + self.offs]

        if 'FPT' in wanted:
            record['FPT'] = decode_from_base64(fields[5 + self.offs])
            # Mask FPT here !

        if 'Frag' in wanted:
            record['Frag'] = fields[6 + self.offs]
            if self.charged:
                # NOTE(review): fixed column 9, unlike the offs-relative
                # columns above - confirm against the file layout.
                record['FragCharge'] = fields[9]

        if 'InChIKeyValues' in wanted:
            record['InChIKeyValues'] = inchikeyvalues_from_inchi(
                fields[2 + self.offs])

        if 'InChIKey' in wanted:
            record['InChIKey'] = inchikey_from_inchi(fields[2 + self.offs])

        # The formula is parsed once and shared by the three derived fields.
        if any(name in wanted
               for name in ('Formula', 'ElementVector', 'FormulaVector')):
            fla = parse_formula(fields[1 + self.offs].split('/')[0])

            if 'Formula' in wanted:
                record['Formula'] = fla

            if 'ElementVector' in wanted:
                record['ElementVector'] = formula_to_element_vector(fla)

            if 'FormulaVector' in wanted:
                record['FormulaVector'] = encode_formula_to_array(fla)

        return record
def _read_csifingerid_split(data_path, split):
    """Read the InChI, formula and formula-score files of one data split.

    Reads ``<split>_inchi.txt``, ``<split>_formulas.txt`` and
    ``<split>_formula_scores.txt`` from ``data_path``, one entry per line.

    Returns:
        ``(keys, formulas, scores)`` where ``keys`` holds the first
        dash-separated block of each InChIKey, ``formulas`` the raw lines and
        ``scores`` the lines converted to float.
    """
    def read_lines(suffix):
        with open('%s/%s_%s.txt' % (data_path, split, suffix), 'r') as finp:
            return [line.rstrip('\n') for line in finp]

    keys = [inchikey_from_inchi(value).split('-')[0]
            for value in read_lines('inchi')]
    formulas = read_lines('formulas')
    scores = [float(value) for value in read_lines('formula_scores')]
    return keys, formulas, scores


def loadresults(peaks, data_path):
    """Attach predicted fingerprints, formulas and scores to matching peaks.

    Loads ``fpt_mask.default``, ``pred_test.txt`` / ``pred_train.txt`` and the
    per-split InChI, formula and score files from ``data_path``.  Every
    prediction whose InChIKey first block matches a peak (via the ``'inchi'``
    parameter of the peak's parent spectrum) is stored in that peak's
    ``parameters`` under ``csifingerid_predfpt_<n>``,
    ``csifingerid_fptmask_<n>``, ``csifingerid_formula_<n>`` and
    ``csifingerid_score_<n>``, with ``csifingerid_count`` holding ``n``.
    Progress information is printed to stdout.

    Args:
        peaks: sequence of peaks exposing ``parameters`` and
            ``parent_spectrum.parameters``.
        data_path: directory containing the input files.

    Returns:
        ``(lost_count, notpresent_count)``: the number of predictions with no
        matching peak, and the number of peaks that received no prediction.

    Raises:
        ValueError: if the prediction, InChI, formula and score counts of a
            split differ.
    """
    with open(data_path + '/fpt_mask.default', 'rb') as finp:
        fpt_mask = np.fromfile(finp, dtype=np.uint8)
    print(fpt_mask.shape)
    fpt_mask = encode_to_base64(fpt_mask)
    print(len(fpt_mask))
    print(fpt_mask)

    # ndmin=2 keeps one row per prediction even when a file holds a single
    # line; otherwise loadtxt returns a 1-D array and both the length check
    # and the row indexing below break.
    pred_test = np.loadtxt(data_path + '/pred_test.txt', ndmin=2)
    pred_train = np.loadtxt(data_path + '/pred_train.txt', ndmin=2)

    inchi_train, formulas_train, scores_train = _read_csifingerid_split(
        data_path, 'train')
    inchi_test, formulas_test, scores_test = _read_csifingerid_split(
        data_path, 'test')

    for part in (pred_test, inchi_test, formulas_test, scores_test,
                 pred_train, inchi_train, formulas_train, scores_train):
        print(len(part))

    if not (len(pred_test) == len(inchi_test) == len(formulas_test)
            == len(scores_test)):
        raise ValueError('Unequal test sizes!')

    if not (len(pred_train) == len(inchi_train) == len(formulas_train)
            == len(scores_train)):
        raise ValueError('Unequal train sizes!')

    # Map InChIKey first block -> peak index; when several peaks share a key
    # the last one wins.
    inchiindex = {}
    for i, peak in enumerate(peaks):
        key = inchikey_from_inchi(peak.parent_spectrum.parameters['inchi'])
        inchiindex[key.split('-')[0]] = i

    inchi = inchi_test + inchi_train
    formulas = formulas_test + formulas_train
    scores = scores_test + scores_train
    pred = np.vstack((pred_test, pred_train))

    lost_count = 0
    for key, formula, score, row in zip(inchi, formulas, scores, pred):
        if key not in inchiindex:
            print('Error! Inchi not found! %s' % key)
            lost_count += 1
            continue

        peak = peaks[inchiindex[key]]
        if 'csifingerid_count' in peak.parameters:
            peak.parameters['csifingerid_count'] += 1
        else:
            peak.parameters['csifingerid_count'] = 1

        # Each additional prediction for the same peak gets the next suffix.
        subi = peak.parameters['csifingerid_count']
        peak.parameters['csifingerid_predfpt_%s' % subi] = ','.join(
            str(value) for value in row.tolist())
        peak.parameters['csifingerid_fptmask_%s' % subi] = fpt_mask
        peak.parameters['csifingerid_formula_%s' % subi] = formula
        peak.parameters['csifingerid_score_%s' % subi] = score
    print('Total Missing: %s' % lost_count)

    notpresent_count = sum(
        1 for peak in peaks if 'csifingerid_count' not in peak.parameters)

    return lost_count, notpresent_count
Beispiel #5
0
def generate_candidate_html(mainHTML,
                            candidate,
                            outpath,
                            index,
                            subpath,
                            merged=False):
    """Write one HTML table row describing a candidate molecule.

    The row shows the total and individual scores, mass, charge and, when
    present, formula, SMILES, InChI, InChIKey and a PubChem button.  If the
    candidate has a non-empty SMILES and ``generate_candidate_image``
    succeeds, the structure image is placed in a nested table next to the
    text.

    Args:
        mainHTML: writable text stream receiving the HTML.
        candidate: mapping with at least ``TotalScore``, ``Scores``, ``Mass``
            and ``Charge``; ``Annotation``, ``SMILES``, ``InChI``,
            ``Formula`` / ``FormulaVector``, ``Adduct`` and ``Isotope`` are
            optional.
        outpath: directory where ``candidate_<index>.png`` is written.
        index: value used in the image file name.
        subpath: path prefix used in the ``<img src=...>`` attribute.
        merged: when true, an adduct / isotope line is added.
    """
    if 'Annotation' in candidate:
        if candidate['Annotation'] == 'Correct':
            mainHTML.write('<tr><td bgcolor="#ccffcc">\n')
        else:
            mainHTML.write('<tr><td bgcolor="#e0e0e0">\n')
    else:
        mainHTML.write('<tr><td>\n')

    # Default to '' so candidates without a SMILES entry simply get no image
    # instead of failing on an unassigned local.
    smiles = candidate['SMILES'] if 'SMILES' in candidate else ''

    has_image = False
    if smiles != '':
        fname = os.path.join(outpath, 'candidate_%s.png' % index)
        if generate_candidate_image(fname, smiles):
            has_image = True

    if has_image:
        mainHTML.write(
            '<table><tr><td><img src="%s/candidate_%s.png"></td><td>\n' %
            (subpath, index))

    mainHTML.write('<b>Total Score:</b> %s\n' % candidate['TotalScore'])
    mainHTML.write('<br><b>Individual Scores:</b>\n')
    for score in candidate['Scores']:
        mainHTML.write('<br><b>%s:</b> %s\n' %
                       (score, candidate['Scores'][score]))

    mainHTML.write('<br><b>Mass:</b> %s, <b>Charge:</b> %s' %
                   (candidate['Mass'], candidate['Charge']))

    # Prefer an explicit formula object; fall back to the encoded vector.
    formula = ''
    if 'Formula' in candidate:
        formula = candidate['Formula'].formula_to_string()
    elif 'FormulaVector' in candidate:
        formula = decode_formula_from_array(
            candidate['FormulaVector']).formula_to_string()

    if formula != '':
        mainHTML.write(', <b>Formula:</b> %s' % formula)

    if merged:
        adduct = candidate['Adduct'] if 'Adduct' in candidate else ''
        isotope = candidate['Isotope'] if 'Isotope' in candidate else ''
        mainHTML.write('<br><b>Adduct:</b> %s, <b>Isotope:</b> %s\n' %
                       (adduct, isotope))

    if 'SMILES' in candidate:
        mainHTML.write('<br><b>SMILES:</b> %s\n' % candidate['SMILES'])

    if 'InChI' in candidate:
        mainHTML.write('<br><b>InChI</b>=1S/%s\n' % candidate['InChI'])
        # The stored InChI lacks the standard prefix; restore it before
        # computing the key.
        inchi = 'InChI=1S/%s' % candidate['InChI']
        inchikey = inchikey_from_inchi(inchi)
        mainHTML.write('<br><b>InChIKey:</b> %s\n' % inchikey)
        mainHTML.write(
            '<br><button onclick="CallPubChem(this.id)" id="%s">PubChem</button><br>\n'
            % (inchikey))

    if has_image:
        mainHTML.write('</td></tr></table>\n')

    mainHTML.write('</td></tr>\n')