def configure_scorer(self, peak, annotation):
    """Load CSI:FingerID candidate fingerprints for the annotated peak.

    Reads up to ``csifingerid_count`` candidates from the parent peak's
    parameters; candidates with a non-positive score are skipped.
    Populates ``self.csi_fpts`` / ``self.csi_formulas`` /
    ``self.csi_formulascores`` / ``self.csi_fpt_masks`` and resets
    ``self.csicount`` to the number of candidates actually kept.

    Note: ``peak`` is ignored — the peak is always taken from
    ``annotation.parent_peak`` (parameter kept for interface compatibility).

    Returns:
        0 on success, -1 when the peak carries no CSI:FingerID data.
    """
    peak = annotation.parent_peak
    if 'csifingerid_count' not in peak.parameters:
        # No CSI:FingerID annotation on this peak.
        return -1
    self.csicount = int(peak.parameters['csifingerid_count'])
    self.csi_fpts = []
    self.csi_formulas = []
    self.csi_formulascores = []
    self.csi_fpt_masks = []
    # Candidate parameters are 1-based: *_1 .. *_<csicount>.
    for cand in range(1, self.csicount + 1):
        fscore = float(peak.parameters['csifingerid_score_%s' % cand])
        if fscore <= 0.0:
            continue  # discard non-positive-scored candidates
        pred_fpt = peak.parameters['csifingerid_predfpt_%s' % cand]
        formula = peak.parameters['csifingerid_formula_%s' % cand]
        mask = peak.parameters['csifingerid_fptmask_%s' % cand]
        mask = np.unpackbits(decode_from_base64(mask))
        # The predicted fingerprint only covers the masked bits; scatter it
        # back into a full-length vector (zeros where the mask is off).
        fpt = np.array([float(x) for x in pred_fpt.split(',')],
                       dtype=np.float64)
        full_fpt = np.zeros(mask.shape, dtype=np.float64)
        full_fpt[mask > 0] = fpt
        self.csi_fpts.append(full_fpt)
        self.csi_formulas.append(FormulasFilter(formula))
        self.csi_fpt_masks.append(mask)
        self.csi_formulascores.append(fscore)
    # Some candidates may have been skipped, so recount.
    self.csicount = len(self.csi_fpts)
    return 0
def generate_fpt(index, peak, subpath, mode, mask):
    """Write a consensus +/-1 fingerprint for a spectrum to a batch folder.

    Averages the ``fptcount`` base64-packed fingerprints stored in the
    spectrum's parameters, rounds each bit to a majority vote, maps
    {0, 1} -> {-1, +1}, and writes the bits selected by ``mask`` (space
    separated) to ``<subpath>/<crossvalidation_batch_index>/<index>.fpt``.

    Parameters:
        index: record index used as the output file stem.
        peak: object whose ``parent_spectrum.parameters`` holds the data.
        subpath: output root folder.
        mode: currently unused; kept for interface compatibility.
        mask: 0/1 vector; only positions with mask > 0 are written.
    """
    fptcount = int(peak.parent_spectrum.parameters['fptcount'])
    fpt = np.zeros((len(mask), ), dtype=np.float32)
    for i in range(fptcount):
        f = decode_from_base64(peak.parent_spectrum.parameters['fpt_%s' % i])
        fpt += np.array(np.unpackbits(f), dtype=np.float32)
    # Majority vote per bit, then map {0,1} -> {-1,+1}.
    fpt = np.rint(fpt / fptcount) * 2 - 1
    batch = int(peak.parent_spectrum.parameters['crossvalidation_batch_index'])
    outdir = subpath + '/%s' % batch
    # exist_ok avoids the exists()/makedirs() race of the original code.
    os.makedirs(outdir, exist_ok=True)
    fname = outdir + '/%s.fpt' % index
    with open(fname, 'w') as fout:
        for i in range(len(mask)):
            if mask[i] > 0:
                fout.write('%s ' % fpt[i])
def _get_next_raw_record(self):
    """Return the next MolecularRecord parsed from the sharded .st2 files.

    The database is sharded by integer m/z into files
    <base>/<mz//1000>/<mz%1000//100>/<mz%100//10>/<mz%10>.st2.  The read
    cursor lives in self.mzcurrent / self.currentfile / self.datafile /
    self.record_index; StopIteration is raised once self.mzcurrent exceeds
    self.mzmax_int.  Only the fields named in self.required_fields are
    parsed into the returned record.
    """
    if self.currentfile=='':
        # First call: build the shard path for the current m/z, then skip
        # forward to the first shard file that actually exists.
        self.currentfile=os.path.join(self.database_path, self.db_name, self.subf, str(self.mzcurrent//1000), str(self.mzcurrent%1000//100), str(self.mzcurrent%100//10), '%s.st2'%str(self.mzcurrent%10));
        while (not os.path.isfile(self.currentfile)) and (self.mzcurrent<=self.mzmax_int):
            self.mzcurrent+=1;
            self.currentfile=os.path.join(self.database_path, self.db_name, self.subf, str(self.mzcurrent//1000), str(self.mzcurrent%1000//100), str(self.mzcurrent%100//10), '%s.st2'%str(self.mzcurrent%10));
        if self.mzcurrent<=self.mzmax_int:
            self.datafile=open(self.currentfile,'r');
            self.record_index=-1;
        else:
            raise StopIteration();
    s=self.datafile.readline();
    self.record_index+=1;
    # '' from readline() means EOF: close the current shard and advance to
    # the next existing one (same shard-location logic as above, duplicated).
    while s=='':
        self.datafile.close();
        self.mzcurrent+=1;
        self.currentfile=os.path.join(self.database_path, self.db_name, self.subf, str(self.mzcurrent//1000), str(self.mzcurrent%1000//100), str(self.mzcurrent%100//10), '%s.st2'%str(self.mzcurrent%10));
        while (not os.path.isfile(self.currentfile)) and (self.mzcurrent<=self.mzmax_int):
            self.mzcurrent+=1;
            self.currentfile=os.path.join(self.database_path, self.db_name, self.subf, str(self.mzcurrent//1000), str(self.mzcurrent%1000//100), str(self.mzcurrent%100//10), '%s.st2'%str(self.mzcurrent%10));
        if self.mzcurrent<=self.mzmax_int:
            self.datafile=open(self.currentfile,'r');
            self.record_index=-1;
        else:
            raise StopIteration();
        s=self.datafile.readline();
        self.record_index+=1;
    # One tab-separated record per line.
    s=s.rstrip('\n').split('\t');
    record=MolecularRecord();
    record['MZ']=float(s[0]);
    if self.charged:
        record['Mass']=float(s[1]);
        record['Charge']=float(s[2]);
    else:
        # Neutral database: mass equals m/z and there is no charge column.
        record['Mass']=record['MZ'];
        record['Charge']=0;
    # self.offs shifts column indices for charged shards (extra mass/charge
    # columns) -- presumably 2 when charged and 0 otherwise; confirm where
    # self.offs is assigned.
    if 'ShortInChI' in self.required_fields:
        record['ShortInChI']=parse_inchi(s[2+self.offs])[0];
    if 'InChI' in self.required_fields:
        record['InChI']=s[2+self.offs];
    if 'SMILES' in self.required_fields:
        record['SMILES']=s[3+self.offs];
    if 'IDs' in self.required_fields:
        record['IDs']=s[4+self.offs];
    if 'FPT' in self.required_fields:
        record['FPT']=decode_from_base64(s[5+self.offs]); # Mask FPT here !
    if 'Frag' in self.required_fields:
        record['Frag']=s[6+self.offs];
        # NOTE(review): nesting reconstructed from collapsed source --
        # FragCharge is read only together with Frag; confirm.
        if self.charged:
            record['FragCharge']=s[9];
    if 'InChIKeyValues' in self.required_fields:
        record['InChIKeyValues']=inchikeyvalues_from_inchi(s[2+self.offs]);
    if 'InChIKey' in self.required_fields:
        record['InChIKey']=inchikey_from_inchi(s[2+self.offs]);
    if ('Formula' in self.required_fields) or ('ElementVector' in self.required_fields) or ('FormulaVector' in self.required_fields):
        # The formula is the leading segment (before the first '/') of the
        # short-InChI column.
        fla=parse_formula(s[1+self.offs].split('/')[0]);
        if 'Formula' in self.required_fields:
            record['Formula']=fla;
        if 'ElementVector' in self.required_fields:
            record['ElementVector']=formula_to_element_vector(fla);
        if 'FormulaVector' in self.required_fields:
            record['FormulaVector']=encode_formula_to_array(fla);
    return record;
#%% Query candidate molecules for the current precursor and accumulate
# per-bit fingerprint counts.
# NOTE(review): script-cell fragment -- `mz`, `charge`, `spectrum`,
# `missing`, `fpts`, `inchifilter`, `dbmanager`, `specmanager` and the
# `test_*` globals are defined in earlier cells not shown here, and the
# first part presumably runs inside a loop over spectra; indentation below
# is reconstructed from collapsed source -- TODO confirm against the full
# script.
result = dbmanager.query_by_mz_scored(mz, 20, charge,
        db_indexes=dbmanager.db_indexes_from_db_names(test_chemical_databases, case_sensitive=False),
        filters=[inchifilter], scorers=[], required_fields=set(['FPT']),
        results_limit=-1, save_memory=False)
ff = []
if len(result.mol_list) == 0:
    # No database hit for this precursor mass.
    missing += 1
    print('Missing: %s' % missing)
else:
    # Collect the distinct fingerprints of all candidate molecules.
    for mol in result.mol_list:
        ff.append(encode_to_base64(mol['FPT']))
    ff = set(ff)
    spectrum.parameters['FPTCount'] = len(ff)
    cc = -1
    for subfpt in ff:
        cc += 1
        spectrum.parameters['FPT_%s' % cc] = subfpt
        # Unpack to a 0/1 bit vector and add to the global per-bit counts.
        subfpt = decode_from_base64(subfpt)
        subfpt = np.unpackbits(subfpt)
        fpts = np.add(fpts, subfpt)
#%%
print('Exporting spectra')
specmanager.export_textfile_spectra_to_folder(test_spectral_database_outpath)
#%%
#test_fpt_stat_outfile='e:/Imperial/TestDB/FPT.txt';
print('Exporting FPT stats')
# 11416 is the full (unmasked) fingerprint length used throughout this file.
fout = open(test_fpt_stat_outfile, 'w')
for i in range(11416):
    fout.write('%s\t%s\n' % (i, fpts[i]))
fout.close()
#%%
def hdf5_import_from_st2raw(self, inpath, fptmask=np.ones((11416, ), np.uint8)):
    """Import a sharded .st2 flat-file chemical database into this HDF5 container.

    Parameters:
        inpath: root folder holding dbinfo.dat and the /Negative, /Positive
            and /Neutral shard trees (<i>/<j>/<k>/<l>.st2, i in 0..1999,
            j/k/l in 0..9).
        fptmask: 0/1 vector over the 11416 original fingerprint bits; only
            bits set to 1 are kept (re-packed) in the stored fingerprints.

    Copies dbinfo.dat keys into container attributes (forcing DBFormat=3),
    builds the FingerPrints mask/info datasets, then walks each polarity
    tree appending one row per molecule to the per-polarity datasets:
    packed fingerprints, m/z (/mass/charge), InChI-key values, element and
    formula vectors, fragment prints, plus an ASCII blob that holds
    SMILES / IDs / InChI parts addressed by [start, end) index pairs.

    NOTE(review): mutable default argument `fptmask=np.ones(...)` is a
    Python anti-pattern (a single shared array for every call); it is only
    read here so it is currently harmless, but a `None` default resolved in
    the body would be safer.
    NOTE(review): np.string_ was removed in NumPy 2.0 -- use np.bytes_ when
    upgrading.
    """
    if not os.path.isfile(os.path.join(inpath, 'dbinfo.dat')):
        raise IOError('Database info file not found: %s' % os.path.join(inpath, 'dbinfo.dat'))
    self.HDF5container.attrs['HDF5ContainerType'] = np.string_('DistilledChemicalDatabase')
    self.HDF5container.attrs['HDF5ContainerVersion'] = np.string_('1.0')
    # Copy every key=value pair from dbinfo.dat into container attributes,
    # forcing the DBFORMAT version to 3.
    finp = open(os.path.join(inpath, 'dbinfo.dat'), 'r')
    #fout=open(os.path.join(self.folderpath,'dbinfo.dat'),'w');
    for s in finp:
        s = s.rstrip('\n').lstrip().split('=', 1)
        if s[0].upper() == 'DBFORMAT':
            s[1] = '3'
        if s[0] != '':
            #fout.write('%s=%s\n'%(s[0],s[1]));
            self.HDF5container.attrs[s[0]] = np.string_(s[1])
    #fout.close();
    finp.close()
    # Indices of the fingerprint bits retained by the mask.
    fptlist = []
    for i in range(11416):
        if fptmask[i] == 1:
            fptlist.append(i)
    fptlen = len(fptlist)
    # Byte width of a packed reduced fingerprint: this is the row width of
    # every packed-fingerprint dataset created below.
    fptsubmask = np.packbits(np.ones((fptlen, ), np.uint8))
    fptmasklen = len(fptsubmask)
    fptindexes = np.array(fptlist, dtype=np.uint32)
    packedmask = np.packbits(fptmask)
    packedmasklen = len(packedmask)
    #hdf5_ascii_string = h5py.special_dtype(vlen=bytes);
    fptgroup = self.HDF5container.create_group('FingerPrints')
    #Original mask, packed
    fptoriginalmask = fptgroup.create_dataset("FPTOriginalMask", (packedmasklen, ), maxshape=(packedmasklen, ), dtype=np.uint8)
    fptoriginalmask[:] = packedmask[:]
    #List of indeces of original FPT bits (11416)
    fptmask = fptgroup.create_dataset("FPTMask", (fptlen, ), maxshape=(fptlen, ), compression="gzip", compression_opts=4, dtype=np.uint32)
    fptmask[:] = fptindexes[:]
    #Mask for working bits (packed)
    fptsubmask = fptgroup.create_dataset("FPTsubmask", (1, fptmasklen), chunks=(100, fptmasklen), maxshape=(None, fptmasklen), compression="gzip", compression_opts=4, dtype=np.uint8)
    #FPT info: 0 - original bit count=11416, 1 - length of new fpt after masking, 2 - length of the packed fpt, 3 - No of padding bits
    fptinfo = fptgroup.create_dataset("FPTInfo", (4, ), maxshape=(4, ), dtype=np.uint32)
    fptinfo[0] = 11416
    fptinfo[1] = fptlen
    fptinfo[2] = fptmasklen
    fptinfo[3] = fptmasklen * 8 - fptlen
    print('Listing input files')
    subpaths = ['/Negative', '/Positive', '/Neutral']
    #subpaths=['/Positive'];
    for subpath in subpaths:
        print(subpath)
        # Per-polarity groups and resizable datasets (grown as records arrive).
        fptgroup = self.HDF5container.create_group(subpath + '/FingerPrints')
        fraggroup = self.HDF5container.create_group(subpath + '/FragPrints')
        chemgroup = self.HDF5container.create_group(subpath + '/ChemInfo')
        chargegroup = self.HDF5container[subpath]
        #New FPT Array, packed and trimmed to fptmask
        fptdataset = fptgroup.create_dataset("FPTArray", (1, fptmasklen), chunks=(100, fptmasklen), maxshape=(None, fptmasklen), compression="gzip", compression_opts=4, dtype=np.uint8)
        if subpath != '/Neutral':
            # Charged polarities store (mz, mass, charge) triples.
            masschargedataset = chargegroup.create_dataset("MZMassCharge", (1, 3), chunks=(10000, 3), maxshape=(None, 3), compression="gzip", compression_opts=4, dtype=np.float32)
        else:
            # Neutral stores only m/z.
            mzdataset = chargegroup.create_dataset("MZ", (1, ), chunks=(10000, ), maxshape=(None, ), compression="gzip", compression_opts=4, dtype=np.float32)
        inchikey_dataset = chemgroup.create_dataset("InChiKeyValues", (1, 15), chunks=(10000, 15), maxshape=(None, 15), compression="gzip", compression_opts=4, dtype=np.uint8)
        elementsvector_dataset = chemgroup.create_dataset("ElementsVector", (1, 12), chunks=(10000, 12), maxshape=(None, 12), compression="gzip", compression_opts=4, dtype=np.uint8)
        formulavector_dataset = chemgroup.create_dataset("FormulaVector", (1, 96), chunks=(10000, 96), maxshape=(None, 96), compression="gzip", compression_opts=4, dtype=np.uint16)
        # FragPrintIndex rows are [start, end) offsets into FragPrintValues.
        fragprintindex_dataset = fraggroup.create_dataset("FragPrintIndex", (1, 2), chunks=(10000, 2), maxshape=(None, 2), compression="gzip", compression_opts=4, dtype=np.int64)
        fragprintvalues_dataset = fraggroup.create_dataset("FragPrintValues", (1, ), chunks=(10000, ), maxshape=(None, ), compression="gzip", compression_opts=4, dtype=np.float32)
        # SMILES / IDs / InChi rows are [start, end) offsets into ASCII.
        smiles_dataset = chemgroup.create_dataset("SMILES", (1, 2), chunks=(10000, 2), maxshape=(None, 2), compression="gzip", compression_opts=4, dtype=np.int64)
        inchi_dataset = chemgroup.create_dataset("InChi", (1, 4, 2), chunks=(10000, 4, 2), maxshape=(None, 4, 2), compression="gzip", compression_opts=4, dtype=np.int64)
        ids_dataset = chemgroup.create_dataset("IDs", (1, 2), chunks=(10000, 2), maxshape=(None, 2), compression="gzip", compression_opts=4, dtype=np.int64)
        ascii_dataset = chemgroup.create_dataset("ASCII", (1, ), chunks=(10000, ), maxshape=(None, ), compression="gzip", compression_opts=4, dtype=np.uint8)
        recordindex = -1
        # Enumerate every existing <i>/<j>/<k>/<l>.st2 shard under this polarity.
        fileslist = []
        for i in range(0, 2000):
            if os.path.exists(inpath + subpath + '/%s' % i):
                print(inpath + subpath + '/%s' % i)
                for j in range(0, 10):
                    if os.path.exists(inpath + subpath + '/%s/%s' % (i, j)):
                        for k in range(0, 10):
                            if os.path.exists(inpath + subpath + '/%s/%s/%s' % (i, j, k)):
                                for l in range(0, 10):
                                    if os.path.isfile(inpath + subpath + '/%s/%s/%s/%s.st2' % (i, j, k, l)):
                                        fileslist.append(inpath + subpath + '/%s/%s/%s/%s.st2' % (i, j, k, l))
        print('Total number of input files: %s' % len(fileslist))
        for filename in fileslist:
            fpath, fname = os.path.split(filename)
            # Rebinds the outer loop variable `subpath`; harmless because the
            # outer `for` reassigns it on the next iteration.
            subpath = fpath.replace(inpath, '')
            if 'Neutral' in subpath:
                charged = False
                offs = 0
            else:
                # Charged shards carry two extra columns (mass, charge).
                charged = True
                offs = 2
            print('Importing: .../%s/%s' % (subpath, fname))
            # Phase 1: parse the whole file into dblist rows of
            # [recordindex, mz, charged, mass, charge, inchi, fpt, frag,
            #  fragcharge, smiles, ids].
            dblist = []
            with open(filename, 'r') as finp:
                for s in finp:
                    try:
                        s = s.replace('\n', '').replace('\r', '').split('\t')
                        mz = float(s[0])
                        if charged:
                            mass = float(s[1])
                            charge = float(s[2])
                        else:
                            mass = mz
                            charge = 0.0
                        # 12.0 = mass of carbon; skips sub-carbon junk rows.
                        if mass >= 12.0:
                            #shortinchi=s[1+offs];
                            inchi = s[2 + offs]
                            smiles = s[3 + offs]
                            ids = s[4 + offs]
                            fpt = s[5 + offs]
                            fpt = decode_from_base64(fpt)
                            fpt = np.unpackbits(fpt)
                            frag = s[6 + offs]
                            if charged:
                                fragcharge = s[9]
                            else:
                                fragcharge = ''
                            recordindex += 1
                            if recordindex % 1000 == 0:
                                print('Total: %s' % (recordindex + 1))
                            dblist.append([recordindex, mz, charged, mass, charge, inchi, fpt, frag, fragcharge, smiles, ids])
                    except:
                        # NOTE(review): bare except silently swallows every
                        # error (including typos); should at least log the
                        # exception and the offending line.
                        print('Error! Skipping!')
            # Phase 2: grow the datasets once per file, then write the rows.
            if len(dblist) > 0:
                #expand datasets here
                fptdataset.resize((recordindex + 1, fptmasklen))
                if charged:
                    masschargedataset.resize((recordindex + 1, 3))
                else:
                    mzdataset.resize((recordindex + 1, ))
                inchikey_dataset.resize((recordindex + 1, 15))
                elementsvector_dataset.resize((recordindex + 1, 12))
                formulavector_dataset.resize((recordindex + 1, 96))
                fragprintindex_dataset.resize((recordindex + 1, 2))
                smiles_dataset.resize((recordindex + 1, 2))
                ids_dataset.resize((recordindex + 1, 2))
                inchi_dataset.resize((recordindex + 1, 4, 2))
                for db in dblist:
                    currentindex = db[0]
                    # Keep only the masked bits, re-packed to bytes.
                    fptdataset[currentindex, :] = np.packbits(db[6][fptindexes])[:]
                    #print(inchi)
                    inchi = parse_inchi(db[5])
                    #print(inchi)
                    inchikeyvalues = inchikeyvalues_from_inchi(db[5])
                    sformula = inchi[0].split('/', 1)[0]
                    #print(sformula);
                    formula = parse_formula(sformula)
                    elementsvector = formula_to_element_vector(formula)
                    encodedformula = encode_formula_to_array(formula)
                    charge = db[4]
                    charged = db[2]
                    if charged:
                        #print(db[7],db[8])
                        frags = parse_string_fragment_charges(charge, db[7], db[8])
                        #print(frags)
                    else:
                        frags = parse_string_fragments(db[7])
                    if charged:
                        masschargedataset[currentindex, 0] = db[1]
                        masschargedataset[currentindex, 1] = db[3]
                        masschargedataset[currentindex, 2] = charge
                    else:
                        mzdataset[currentindex] = db[1]
                    inchikey_dataset[currentindex, :] = inchikeyvalues[:]
                    elementsvector_dataset[currentindex, :] = elementsvector[:]
                    formulavector_dataset[currentindex, :] = encodedformula[:]
                    # Append this molecule's fragment values to the shared
                    # flat array; self.fragprintpos persists across calls.
                    fragcount = len(frags)
                    frags = np.array(frags, dtype=np.float32)
                    fragprintindex_dataset[currentindex, 0] = self.fragprintpos
                    fragprintindex_dataset[currentindex, 1] = self.fragprintpos + fragcount
                    fragprintvalues_dataset.resize((self.fragprintpos + fragcount, ))
                    fragprintvalues_dataset[self.fragprintpos:self.fragprintpos + fragcount] = frags[:]
                    self.fragprintpos += fragcount
                    # Encode the string fields to ASCII byte arrays and append
                    # them to the shared ASCII blob, recording [start, end)
                    # offsets; self.asciipos persists across calls.
                    smiles = bytearray(db[9].encode('ascii'))
                    smileslen = len(smiles)
                    smiles = np.array(smiles, dtype=np.uint8)
                    ids = bytearray(db[10].encode('ascii'))
                    idslen = len(ids)
                    ids = np.array(ids, dtype=np.uint8)
                    sinchi = inchi[0].split('/', 1)
                    if len(sinchi) > 1:
                        sinchi = sinchi[1]
                    else:
                        sinchi = ''
                    # InChI stored as 4 slices: formula, remainder of part 0,
                    # then parts 2 and 1 -- NOTE(review): parts 2 and 1 are
                    # stored in swapped order; presumably intentional, confirm
                    # against the reader.
                    inchi0 = bytearray(sformula.encode('ascii'))
                    inchi1 = bytearray(sinchi.encode('ascii'))
                    inchi2 = bytearray(inchi[2].encode('ascii'))
                    inchi3 = bytearray(inchi[1].encode('ascii'))
                    inchi0len = len(inchi0)
                    inchi1len = len(inchi1)
                    inchi2len = len(inchi2)
                    inchi3len = len(inchi3)
                    inchi0 = np.array(inchi0, dtype=np.uint8)
                    inchi1 = np.array(inchi1, dtype=np.uint8)
                    inchi2 = np.array(inchi2, dtype=np.uint8)
                    inchi3 = np.array(inchi3, dtype=np.uint8)
                    ascii_dataset.resize((self.asciipos + smileslen + idslen + inchi0len + inchi1len + inchi2len + inchi3len, ))
                    smiles_dataset[currentindex, 0] = self.asciipos
                    smiles_dataset[currentindex, 1] = self.asciipos + smileslen
                    ascii_dataset[self.asciipos:self.asciipos + smileslen] = smiles[:]
                    self.asciipos += smileslen
                    ids_dataset[currentindex, 0] = self.asciipos
                    ids_dataset[currentindex, 1] = self.asciipos + idslen
                    ascii_dataset[self.asciipos:self.asciipos + idslen] = ids[:]
                    self.asciipos += idslen
                    inchi_dataset[currentindex, 0, 0] = self.asciipos
                    inchi_dataset[currentindex, 0, 1] = self.asciipos + inchi0len
                    ascii_dataset[self.asciipos:self.asciipos + inchi0len] = inchi0[:]
                    self.asciipos += inchi0len
                    inchi_dataset[currentindex, 1, 0] = self.asciipos
                    inchi_dataset[currentindex, 1, 1] = self.asciipos + inchi1len
                    ascii_dataset[self.asciipos:self.asciipos + inchi1len] = inchi1[:]
                    self.asciipos += inchi1len
                    inchi_dataset[currentindex, 2, 0] = self.asciipos
                    inchi_dataset[currentindex, 2, 1] = self.asciipos + inchi2len
                    ascii_dataset[self.asciipos:self.asciipos + inchi2len] = inchi2[:]
                    self.asciipos += inchi2len
                    inchi_dataset[currentindex, 3, 0] = self.asciipos
                    inchi_dataset[currentindex, 3, 1] = self.asciipos + inchi3len
                    ascii_dataset[self.asciipos:self.asciipos + inchi3len] = inchi3[:]
                    self.asciipos += inchi3len
    print('Import Finished!')
def _pipe_from_textfile(self, finp): while True: s = finp.readline() if s == '': return s = s.rstrip('\n').lstrip() if '##' in s: s = s[:s.index('##')] if '=' in s: s = s.split('=', 1) if s[0].lower().startswith('totalscore'): self['TotalScore'] = float(s[1]) elif s[0].lower().startswith('adduct'): self['Adduct'] = s[1] elif s[0].lower().startswith('isotopeextramass'): self['IsotopeExtraMass'] = float(s[1]) elif s[0].lower().startswith('isotope'): self['Isotope'] = int(s[1]) elif s[0].lower().startswith('mz'): self['MZ'] = float(s[1]) elif s[0].lower().startswith('mass'): self['Mass'] = float(s[1]) elif s[0].lower().startswith('charge'): self['Charge'] = int(s[1]) elif s[0].lower().startswith('dbformat'): self['DBFormat'] = int(s[1]) elif s[0].lower().startswith('dbindex'): self['DBIndex'] = int(s[1]) elif s[0].lower().startswith('rindex'): self['RIndex'] = int(s[1]) elif s[0].lower().startswith('dbname'): self['DBName'] = s[1] elif s[0].lower().startswith('rfile'): self['RFile'] = s[1] elif s[0].lower().startswith('smiles'): self['SMILES'] = s[1] elif s[0].lower().startswith('ids'): self['IDs'] = s[1] elif s[0].lower().startswith('annotation'): self['Annotation'] = s[1] elif s[0].lower().startswith('shortinchi'): self['ShortInChI'] = s[1] elif s[0].lower().startswith('inchikeyvalues'): self['InChIKeyValues'] = string_to_numpy_byte_array(s[1]) elif s[0].lower().startswith('inchikey'): self['InChiKey'] = s[1] elif s[0].lower().startswith('inchi'): self['InChI'] = s[1] elif s[0].lower().startswith('formulavector'): self['FormulaVector'] = string_to_numpy_uint16_array(s[1]) elif s[0].lower().startswith('elementvector'): self['ElementVector'] = string_to_numpy_byte_array(s[1]) elif s[0].lower().startswith('frag'): self['Frag'] = string_to_float_list(s[1]) elif s[0].lower().startswith('formula'): self['Formula'] = parse_formula(s[1]) elif s[0].lower().startswith('fpt'): self['FPT'] = decode_from_base64(s[1]) elif s[0].lower().startswith('scores'): if not ('Scores' in 
self): self['Scores'] = {} score = s[1].split(':', 1) self['Scores'][score[0]] = float(score[1]) elif s.lower().startswith('end'): return
positivebatch = [] negativebatch = [] scount = 0 for spectrum in specmanager.ms_spectra: scount += 1 print('%s of %s' % (scount, len(specmanager.ms_spectra))) fpts = [] fptcount = int(spectrum.parameters['fptcount']) ff = set() for i in range(fptcount): ff.add(spectrum.parameters['fpt_%s' % i]) for fpt in ff: fpts.append(np.unpackbits(decode_from_base64(fpt))) batch = int(spectrum.parameters['crossvalidation_batch_index']) for peak in spectrum.peaks: ion_type = '' if 'ion_type' in peak.parameters: ion_type = peak.parameters['ion_type'] elif peak.ms_spectra: ion_type = peak.ms_spectra[0].parameters['precursor_ion'] if ion_type == '[M+H]+' or ion_type == '[M-H]-': subcount = 0 vector = np.zeros((20000, 1), dtype=np.float32) values = [] for subspectrum in peak.ms_spectra: if subspectrum.parameters['level'] == 2: subspectrum.normalize_to_one()