def _pipe_from_textfile(self, finp):
    """Load scorer settings from an open text stream.

    Lines are consumed one at a time until end-of-file or until a line
    starting with ``end`` is met.  Everything from ``##`` onwards on a
    line is ignored.  Keywords are matched case-insensitively by prefix:

    * ``formula... = <formula>,<score>`` appends the encoded formula to
      ``self.formulas`` and ``float(<score>)`` to ``self.scores``;
    * ``unknown_score... = <value>`` sets ``self.unknown_score``;
    * ``dict_form...`` sets ``self.vector_form`` to ``False``;
    * ``end...`` stops reading.  When the scorer is not in vector form the
      collected formulas are decoded back from arrays, ``required_fields``
      becomes ``['Formula']`` and the formula-based record processor is
      bound to ``process_molecular_candidate_record``.

    Args:
        finp: object whose ``readline()`` returns ``''`` at end-of-file.
    """
    for raw_line in iter(finp.readline, ''):
        # Drop the line terminator, leading blanks and any '##' comment.
        text = raw_line.rstrip('\n').lstrip().partition('##')[0]

        if '=' in text:
            key, _, value = text.partition('=')
            keyword = key.lower()
            if keyword.startswith('formula'):
                fields = value.split(',')
                self.formulas.append(
                    encode_formula_to_array(parse_formula(fields[0])))
                self.scores.append(float(fields[1]))
            elif keyword.startswith('unknown_score'):
                self.unknown_score = float(value)
            continue

        keyword = text.lower()
        if keyword.startswith('dict_form'):
            self.vector_form = False
        elif keyword.startswith('end'):
            if not self.vector_form:
                self.required_fields = ['Formula']
                for position, encoded in enumerate(self.formulas):
                    self.formulas[position] = decode_formula_from_array(
                        encoded)
                self.process_molecular_candidate_record = self.__process_molecular_candidate_record_formula
            return
def _pipe_from_textfile(self, finp):
    """Load filter settings from an open text stream.

    Lines are consumed one at a time until end-of-file or until a line
    starting with ``end`` is met.  Everything from ``##`` onwards on a
    line is ignored.  Keywords are matched case-insensitively by prefix:

    * ``formula... = <formula>`` appends the encoded formula to
      ``self.filter``;
    * ``dict_form...`` sets ``self.vector_form`` to ``False``;
    * ``end...`` stops reading.  When the filter is not in vector form the
      collected formulas are decoded back from arrays, ``required_fields``
      becomes ``['Formula']`` and the formula-based check is bound to
      ``self.rejected``.

    Args:
        finp: object whose ``readline()`` returns ``''`` at end-of-file.
    """
    for raw_line in iter(finp.readline, ''):
        # Drop the line terminator, leading blanks and any '##' comment.
        text = raw_line.rstrip('\n').lstrip().partition('##')[0]

        if '=' in text:
            key, _, value = text.partition('=')
            if key.lower().startswith('formula'):
                self.filter.append(
                    encode_formula_to_array(parse_formula(value)))
            continue

        keyword = text.lower()
        if keyword.startswith('dict_form'):
            self.vector_form = False
        elif keyword.startswith('end'):
            if not self.vector_form:
                self.required_fields = ['Formula']
                for position, encoded in enumerate(self.filter):
                    self.filter[position] = decode_formula_from_array(
                        encoded)
                self.rejected = self.__rejected_formula
            return
def __init__(self, formulas=None, use_vector_form=True):
    """Build the list of formulas held by the filter.

    Args:
        formulas: ``None`` (empty filter), a comma-separated formula
            string, a formula dict, an already encoded ``np.ndarray``, or
            a list whose items are any of those three.
        use_vector_form: when true the filter keeps encoded arrays and
            requires the ``'FormulaVector'`` record field; otherwise the
            stored entries are decoded back and ``'Formula'`` is required.

    Raises:
        TypeError: if ``formulas`` (or an item of the list) has an
            unsupported type.
    """
    unsupported = 'Wrong type argument for FormulasFilter initialization! str, dict, list of (dict or str) supported only!'

    self.filter = []
    self.vector_form = use_vector_form
    #self.supported_adducts=set();
    self.required_fields = ['FormulaVector'] if self.vector_form else ['Formula']
    if formulas is None:
        return

    # A lone value is handled exactly like a one-item list.
    entries = formulas if isinstance(formulas, list) else [formulas]
    for entry in entries:
        if isinstance(entry, str):
            for token in entry.split(','):
                self.filter.append(
                    encode_formula_to_array(parse_formula(token)))
        elif isinstance(entry, dict):
            self.filter.append(encode_formula_to_array(entry))
        elif isinstance(entry, np.ndarray):
            self.filter.append(entry)
        else:
            raise TypeError(unsupported)

    if not self.vector_form:
        for position, encoded in enumerate(self.filter):
            self.filter[position] = decode_formula_from_array(encoded)
def process_file(fname, correct_key, best_results, worst_results,
                 correct_formula):
    """Update cumulative rank counters from one CSV file of candidates.

    The file is read as CSV and its first row is discarded.  Walking from
    the last row down to index 1, a row is removed when its key (column 9)
    equals the key of the row before it, or when its formula (column 6) is
    rejected by an ``ElementCompositionFilter`` or a ``FormulasFilter``
    built from ``correct_formula``.  The first and last positions at which
    ``correct_key`` occurs among the remaining rows are then used to bump
    the counters in place.

    Args:
        fname: path of the CSV file.
        correct_key: key expected in column 9 of the correct candidate.
        best_results: list of counters; entry ``i`` is incremented when
            the first occurrence of ``correct_key`` is at position <= i.
        worst_results: list of counters indexed like ``best_results``;
            entry ``i`` is incremented when the last occurrence is at
            position <= i.
        correct_formula: formula specification handed to both filters.

    Returns:
        None.  ``best_results`` and ``worst_results`` are modified in
        place and stay untouched when ``correct_key`` is not found.
    """
    correct_elements = ElementCompositionFilter(correct_formula,
                                                correct_formula)
    formula_filter = FormulasFilter(correct_formula)

    # csv.reader needs str lines on Python 3, so the file must be opened in
    # text mode; newline='' leaves line-ending handling to the csv module.
    with open(fname, newline='') as finp:
        results = list(csv.reader(finp))
    del results[0]  # first row is dropped (presumably the header)

    # Walk backwards so deletions never shift the rows still to be visited.
    # NOTE(review): index 0 is never tested against the two filters because
    # the loop stops at 1 for the neighbour comparison -- confirm intended.
    for i in reversed(range(1, len(results))):
        row = results[i]
        if row[9] == results[i - 1][9]:
            del results[i]
            continue
        formula = parse_formula(row[6])
        if correct_elements.rejected(
            {'ElementVector': formula_to_element_vector(formula)}):
            #print('removed wrong elements %s %s'%(results[i][6], correct_formula))
            del results[i]
        elif formula_filter.rejected(
            {'FormulaVector': encode_formula_to_array(formula)}):
            #print('removed wrong formulas %s %s'%(results[i][6], correct_formula))
            del results[i]

    positions = [
        index for index, row in enumerate(results) if correct_key == row[9]
    ]
    if not positions:
        return
    min_correct = positions[0]
    max_correct = positions[-1]

    for i in range(len(best_results)):
        if min_correct <= i:
            best_results[i] += 1
        if max_correct <= i:
            worst_results[i] += 1
def _get_next_raw_record(self):
    """Read the next line of the on-disk database and build a record.

    Data files live at
    ``database_path/db_name/subf/<mz//1000>/<mz%1000//100>/<mz%100//10>/<mz%10>.st2``
    where ``mz`` is ``self.mzcurrent``.  Missing files are skipped by
    incrementing ``self.mzcurrent``; an exhausted file is closed and the
    search continues with the next value.

    Each line is tab-separated.  Column 0 is always stored as ``'MZ'``;
    the remaining record keys are filled only when listed in
    ``self.required_fields``, using ``self.offs`` as the column shift.

    Returns:
        A ``MolecularRecord`` populated from the line that was read.

    Raises:
        StopIteration: when ``self.mzcurrent`` exceeds ``self.mzmax_int``.
    """

    def path_for_current_mz():
        # File that corresponds to the present value of self.mzcurrent.
        return os.path.join(self.database_path, self.db_name, self.subf,
                            str(self.mzcurrent // 1000),
                            str(self.mzcurrent % 1000 // 100),
                            str(self.mzcurrent % 100 // 10),
                            '%s.st2' % str(self.mzcurrent % 10))

    def open_next_available_file():
        # Advance mzcurrent until a file exists, then open it for reading.
        self.currentfile = path_for_current_mz()
        while (not os.path.isfile(self.currentfile)) and (self.mzcurrent <= self.mzmax_int):
            self.mzcurrent += 1
            self.currentfile = path_for_current_mz()
        if self.mzcurrent <= self.mzmax_int:
            self.datafile = open(self.currentfile, 'r')
            self.record_index = -1
        else:
            raise StopIteration()

    if self.currentfile == '':
        open_next_available_file()

    line = self.datafile.readline()
    self.record_index += 1
    while line == '':
        # Current file is exhausted: move on to the next m/z value.
        self.datafile.close()
        self.mzcurrent += 1
        open_next_available_file()
        line = self.datafile.readline()
        self.record_index += 1

    columns = line.rstrip('\n').split('\t')
    wanted = self.required_fields
    shift = self.offs

    record = MolecularRecord()
    record['MZ'] = float(columns[0])
    if self.charged:
        record['Mass'] = float(columns[1])
        record['Charge'] = float(columns[2])
    else:
        record['Mass'] = record['MZ']
        record['Charge'] = 0

    if 'ShortInChI' in wanted:
        record['ShortInChI'] = parse_inchi(columns[2 + shift])[0]
    if 'InChI' in wanted:
        record['InChI'] = columns[2 + shift]
    if 'SMILES' in wanted:
        record['SMILES'] = columns[3 + shift]
    if 'IDs' in wanted:
        record['IDs'] = columns[4 + shift]
    if 'FPT' in wanted:
        record['FPT'] = decode_from_base64(columns[5 + shift])
        # Mask FPT here !
    if 'Frag' in wanted:
        record['Frag'] = columns[6 + shift]
        if self.charged:
            record['FragCharge'] = columns[9]
    if 'InChIKeyValues' in wanted:
        record['InChIKeyValues'] = inchikeyvalues_from_inchi(
            columns[2 + shift])
    if 'InChIKey' in wanted:
        record['InChIKey'] = inchikey_from_inchi(columns[2 + shift])

    # The three formula representations share a single parse of the text
    # that precedes the first '/' in column 1 + offs.
    if any(name in wanted
           for name in ('Formula', 'ElementVector', 'FormulaVector')):
        fla = parse_formula(columns[1 + shift].split('/')[0])
        if 'Formula' in wanted:
            record['Formula'] = fla
        if 'ElementVector' in wanted:
            record['ElementVector'] = formula_to_element_vector(fla)
        if 'FormulaVector' in wanted:
            record['FormulaVector'] = encode_formula_to_array(fla)
    return record
def setup_scorer(self, formulas, scores, unknown_score=0.0):
    """Configure the scorer with formulas and their matching scores.

    Args:
        formulas: a comma-separated formula string, a formula dict, an
            encoded ``np.ndarray``, or a list whose items are any of
            those three.
        scores: a comma-separated string, a float, an int, or a list of
            floats, ints, dicts (stored as given) and comma-separated
            strings.
        unknown_score: value stored in ``self.unknown_score``.

    Raises:
        TypeError: for unsupported argument types, or when the number of
            scores differs from the number of formulas.

    ``self.vector_form`` is read at the end: when it is false the stored
    formulas are decoded back from their array encoding.
    """
    bad_formulas = 'Wrong type argument (formulas) for FormulaVectors initialization! str, formula, list of (formula, formulavector or str) supported only!'

    self.formulas = []
    self.scores = []
    self.unknown_score = unknown_score

    # A lone formula value is handled exactly like a one-item list.
    formula_entries = formulas if isinstance(formulas, list) else [formulas]
    for entry in formula_entries:
        if isinstance(entry, str):
            for token in entry.split(','):
                self.formulas.append(
                    encode_formula_to_array(parse_formula(token)))
        elif isinstance(entry, dict):
            self.formulas.append(encode_formula_to_array(entry))
        elif isinstance(entry, np.ndarray):
            self.formulas.append(entry)
        else:
            raise TypeError(bad_formulas)

    if isinstance(scores, list):
        for entry in scores:
            if isinstance(entry, (float, dict)):
                self.scores.append(entry)
            elif isinstance(entry, int):
                self.scores.append(float(entry))
            elif isinstance(entry, str):
                for token in entry.split(','):
                    self.scores.append(float(token))
            else:
                raise TypeError(
                    'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) or dictionary supported only!'
                )
    elif isinstance(scores, str):
        for token in scores.split(','):
            self.scores.append(float(token))
    elif isinstance(scores, float):
        self.scores.append(scores)
    elif isinstance(scores, int):
        self.scores.append(float(scores))
    else:
        raise TypeError(
            'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) supported only!'
        )

    if len(self.scores) != len(self.formulas):
        raise TypeError(
            'Number of formulas and number of scores supplied do not match!'
        )

    if not self.vector_form:
        for position, encoded in enumerate(self.formulas):
            self.formulas[position] = decode_formula_from_array(encoded)
} test.process_molecular_candidate_record(None, record) print(record) record = { 'Formula': parse_formula('O4'), 'Scores': {} } test.process_molecular_candidate_record(None, record) print(record) print('FormulaVector') test = FormulaScorer() test.setup_scorer('C2H5OH,CH4,PO4', '0.3,0.1,0.5', 1.0) record = { 'FormulaVector': encode_formula_to_array(parse_formula('CH4')), 'Scores': {} } test.process_molecular_candidate_record(None, record) print(record) record = { 'FormulaVector': encode_formula_to_array(parse_formula('PO4')), 'Scores': {} } test.process_molecular_candidate_record(None, record) print(record) record = { 'FormulaVector': encode_formula_to_array(parse_formula('C2H5OH')), 'Scores': {}
# Dictionary-form checks; the original annotates each of these as True.
for rejected_formula in ('CH4N', 'CH3', 'C2H4', 'C', 'CO4'):
    print(test.rejected({'Formula': parse_formula(rejected_formula)}))  #True

print('FormulasVector')

# Construction from a single formula string.
test = FormulasFilter('C2H5OH')
print(test.filter)

# Construction from a comma-separated string.
test = FormulasFilter('C2H5OH,CH4,PO4')
print(test.filter)

# Construction from an already encoded formula array.
ethanol_vector = encode_formula_to_array(parse_formula('C2H5OH'))
test = FormulasFilter(ethanol_vector)
print(test.filter)

# Construction from a list mixing a string and a parsed formula.
test = FormulasFilter(['C2H5OH,PO4', parse_formula('CH4')])
print(test.filter)

# Vector-form checks; the original annotates CH4 and PO4 as False and
# leaves H4C without an annotation.
for candidate_formula in ('CH4', 'PO4', 'H4C'):
    print(
        test.rejected({
            'FormulaVector':
            encode_formula_to_array(parse_formula(candidate_formula))
        }))
def hdf5_import_from_st2raw(self,
                            inpath,
                            fptmask=np.ones((11416, ), np.uint8)):
    """Import a raw ``.st2`` database tree into ``self.HDF5container``.

    Steps performed:

    1. ``<inpath>/dbinfo.dat`` is read as ``key=value`` lines and copied
       into the container attributes (``DBFORMAT`` is forced to ``'3'``).
    2. A ``FingerPrints`` group is written with the packed original mask,
       the indexes of the retained bits and a four-value info vector.
    3. For each of ``/Negative``, ``/Positive`` and ``/Neutral`` the groups
       ``FingerPrints``, ``FragPrints`` and ``ChemInfo`` are created and
       every ``<i>/<j>/<k>/<l>.st2`` file found below it (``i`` in
       0..1999, ``j``/``k``/``l`` in 0..9) is imported record by record.

    Input lines that cannot be parsed are reported with
    ``'Error! Skipping!'`` and skipped; records whose mass is below 12.0
    are ignored.

    Args:
        inpath: root directory holding ``dbinfo.dat`` and the three
            sub-trees.  It is concatenated with ``'/...'`` pieces as a
            plain string.
        fptmask: array with 11416 entries; positions equal to 1 select the
            fingerprint bits that are kept.

    Raises:
        IOError: if ``dbinfo.dat`` is missing.

    NOTE(review): ``self.fragprintpos`` and ``self.asciipos`` are not reset
    in this method, so offsets keep growing across the three sub-trees
    although each has its own value datasets -- confirm this is intended.
    """
    fpt_bits = 11416  # bit count of an unmasked fingerprint
    gzip_opts = dict(compression="gzip", compression_opts=4)

    dbinfo_path = os.path.join(inpath, 'dbinfo.dat')
    if not os.path.isfile(dbinfo_path):
        raise IOError('Database info file not found: %s' % dbinfo_path)

    # np.bytes_ is the same type the removed np.string_ alias pointed to;
    # the alias no longer exists in NumPy 2.0.
    self.HDF5container.attrs['HDF5ContainerType'] = np.bytes_(
        'DistilledChemicalDatabase')
    self.HDF5container.attrs['HDF5ContainerVersion'] = np.bytes_('1.0')

    # Context manager so the handle is released even if a line is malformed.
    with open(dbinfo_path, 'r') as finp:
        for line in finp:
            entry = line.rstrip('\n').lstrip().split('=', 1)
            if entry[0].upper() == 'DBFORMAT':
                entry[1] = '3'
            if entry[0] != '':
                self.HDF5container.attrs[entry[0]] = np.bytes_(entry[1])

    fptlist = [i for i in range(fpt_bits) if fptmask[i] == 1]
    fptlen = len(fptlist)
    # Number of bytes needed to hold fptlen bits once packed.
    fptmasklen = len(np.packbits(np.ones((fptlen, ), np.uint8)))
    fptindexes = np.array(fptlist, dtype=np.uint32)
    packedmask = np.packbits(fptmask)
    packedmasklen = len(packedmask)

    fptgroup = self.HDF5container.create_group('FingerPrints')
    # Original mask, packed
    fptoriginalmask = fptgroup.create_dataset("FPTOriginalMask",
                                              (packedmasklen, ),
                                              maxshape=(packedmasklen, ),
                                              dtype=np.uint8)
    fptoriginalmask[:] = packedmask[:]
    # List of indexes of the retained original FPT bits (out of 11416)
    fptmask_dataset = fptgroup.create_dataset("FPTMask", (fptlen, ),
                                              maxshape=(fptlen, ),
                                              dtype=np.uint32,
                                              **gzip_opts)
    fptmask_dataset[:] = fptindexes[:]
    # Mask for working bits (packed).
    # NOTE(review): this dataset is created but never written here.
    fptgroup.create_dataset("FPTsubmask", (1, fptmasklen),
                            chunks=(100, fptmasklen),
                            maxshape=(None, fptmasklen),
                            dtype=np.uint8,
                            **gzip_opts)
    # FPT info: 0 - original bit count=11416, 1 - length of new fpt after
    # masking, 2 - length of the packed fpt, 3 - No of padding bits
    fptinfo = fptgroup.create_dataset("FPTInfo", (4, ),
                                      maxshape=(4, ),
                                      dtype=np.uint32)
    fptinfo[0] = fpt_bits
    fptinfo[1] = fptlen
    fptinfo[2] = fptmasklen
    fptinfo[3] = fptmasklen * 8 - fptlen

    print('Listing input files')
    subpaths = ['/Negative', '/Positive', '/Neutral']
    for subpath in subpaths:
        print(subpath)
        fptgroup = self.HDF5container.create_group(subpath +
                                                   '/FingerPrints')
        fraggroup = self.HDF5container.create_group(subpath + '/FragPrints')
        chemgroup = self.HDF5container.create_group(subpath + '/ChemInfo')
        chargegroup = self.HDF5container[subpath]

        # New FPT array, packed and trimmed to fptmask
        fptdataset = fptgroup.create_dataset("FPTArray", (1, fptmasklen),
                                             chunks=(100, fptmasklen),
                                             maxshape=(None, fptmasklen),
                                             dtype=np.uint8,
                                             **gzip_opts)
        if subpath != '/Neutral':
            masschargedataset = chargegroup.create_dataset(
                "MZMassCharge", (1, 3),
                chunks=(10000, 3),
                maxshape=(None, 3),
                dtype=np.float32,
                **gzip_opts)
        else:
            mzdataset = chargegroup.create_dataset("MZ", (1, ),
                                                   chunks=(10000, ),
                                                   maxshape=(None, ),
                                                   dtype=np.float32,
                                                   **gzip_opts)
        inchikey_dataset = chemgroup.create_dataset("InChiKeyValues",
                                                    (1, 15),
                                                    chunks=(10000, 15),
                                                    maxshape=(None, 15),
                                                    dtype=np.uint8,
                                                    **gzip_opts)
        elementsvector_dataset = chemgroup.create_dataset(
            "ElementsVector", (1, 12),
            chunks=(10000, 12),
            maxshape=(None, 12),
            dtype=np.uint8,
            **gzip_opts)
        formulavector_dataset = chemgroup.create_dataset(
            "FormulaVector", (1, 96),
            chunks=(10000, 96),
            maxshape=(None, 96),
            dtype=np.uint16,
            **gzip_opts)
        fragprintindex_dataset = fraggroup.create_dataset(
            "FragPrintIndex", (1, 2),
            chunks=(10000, 2),
            maxshape=(None, 2),
            dtype=np.int64,
            **gzip_opts)
        fragprintvalues_dataset = fraggroup.create_dataset(
            "FragPrintValues", (1, ),
            chunks=(10000, ),
            maxshape=(None, ),
            dtype=np.float32,
            **gzip_opts)
        smiles_dataset = chemgroup.create_dataset("SMILES", (1, 2),
                                                  chunks=(10000, 2),
                                                  maxshape=(None, 2),
                                                  dtype=np.int64,
                                                  **gzip_opts)
        inchi_dataset = chemgroup.create_dataset("InChi", (1, 4, 2),
                                                 chunks=(10000, 4, 2),
                                                 maxshape=(None, 4, 2),
                                                 dtype=np.int64,
                                                 **gzip_opts)
        ids_dataset = chemgroup.create_dataset("IDs", (1, 2),
                                               chunks=(10000, 2),
                                               maxshape=(None, 2),
                                               dtype=np.int64,
                                               **gzip_opts)
        ascii_dataset = chemgroup.create_dataset("ASCII", (1, ),
                                                 chunks=(10000, ),
                                                 maxshape=(None, ),
                                                 dtype=np.uint8,
                                                 **gzip_opts)

        recordindex = -1

        # Collect every <i>/<j>/<k>/<l>.st2 file that exists below subpath.
        fileslist = []
        for i in range(0, 2000):
            if os.path.exists(inpath + subpath + '/%s' % i):
                print(inpath + subpath + '/%s' % i)
                for j in range(0, 10):
                    if os.path.exists(inpath + subpath + '/%s/%s' % (i, j)):
                        for k in range(0, 10):
                            if os.path.exists(inpath + subpath +
                                              '/%s/%s/%s' % (i, j, k)):
                                for l in range(0, 10):
                                    candidate = (inpath + subpath +
                                                 '/%s/%s/%s/%s.st2' %
                                                 (i, j, k, l))
                                    if os.path.isfile(candidate):
                                        fileslist.append(candidate)
        print('Total number of input files: %s' % len(fileslist))

        for filename in fileslist:
            fpath, fname = os.path.split(filename)
            relpath = fpath.replace(inpath, '')
            if 'Neutral' in relpath:
                charged = False
                offs = 0
            else:
                charged = True
                offs = 2
            print('Importing: .../%s/%s' % (relpath, fname))

            # First pass: parse the text file into an in-memory list.
            dblist = []
            with open(filename, 'r') as finp:
                for line in finp:
                    try:
                        fields = line.replace('\n', '').replace(
                            '\r', '').split('\t')
                        mz = float(fields[0])
                        if charged:
                            mass = float(fields[1])
                            charge = float(fields[2])
                        else:
                            mass = mz
                            charge = 0.0
                        if mass >= 12.0:
                            inchi = fields[2 + offs]
                            smiles = fields[3 + offs]
                            ids = fields[4 + offs]
                            fpt = np.unpackbits(
                                decode_from_base64(fields[5 + offs]))
                            frag = fields[6 + offs]
                            fragcharge = fields[9] if charged else ''
                            # Only count the record once every field parsed.
                            recordindex += 1
                            if recordindex % 1000 == 0:
                                print('Total: %s' % (recordindex + 1))
                            dblist.append([
                                recordindex, mz, charged, mass, charge,
                                inchi, fpt, frag, fragcharge, smiles, ids
                            ])
                    except Exception:
                        # Best effort: skip unparsable lines, but let
                        # KeyboardInterrupt / SystemExit propagate.
                        print('Error! Skipping!')

            if len(dblist) > 0:
                # Grow the per-record datasets to fit this file's records.
                rows = recordindex + 1
                fptdataset.resize((rows, fptmasklen))
                if charged:
                    masschargedataset.resize((rows, 3))
                else:
                    mzdataset.resize((rows, ))
                inchikey_dataset.resize((rows, 15))
                elementsvector_dataset.resize((rows, 12))
                formulavector_dataset.resize((rows, 96))
                fragprintindex_dataset.resize((rows, 2))
                smiles_dataset.resize((rows, 2))
                ids_dataset.resize((rows, 2))
                inchi_dataset.resize((rows, 4, 2))

            # Second pass: write every parsed record into the datasets.
            for db in dblist:
                currentindex = db[0]
                fptdataset[currentindex, :] = np.packbits(
                    db[6][fptindexes])[:]
                inchi = parse_inchi(db[5])
                inchikeyvalues = inchikeyvalues_from_inchi(db[5])
                sformula = inchi[0].split('/', 1)[0]
                formula = parse_formula(sformula)
                elementsvector = formula_to_element_vector(formula)
                encodedformula = encode_formula_to_array(formula)
                charge = db[4]
                charged = db[2]
                if charged:
                    frags = parse_string_fragment_charges(
                        charge, db[7], db[8])
                else:
                    frags = parse_string_fragments(db[7])

                if charged:
                    masschargedataset[currentindex, 0] = db[1]
                    masschargedataset[currentindex, 1] = db[3]
                    masschargedataset[currentindex, 2] = charge
                else:
                    mzdataset[currentindex] = db[1]
                inchikey_dataset[currentindex, :] = inchikeyvalues[:]
                elementsvector_dataset[currentindex, :] = elementsvector[:]
                formulavector_dataset[currentindex, :] = encodedformula[:]

                # Fragment values are appended to one flat dataset; the
                # index dataset stores the [start, end) slice per record.
                fragcount = len(frags)
                frags = np.array(frags, dtype=np.float32)
                fragend = self.fragprintpos + fragcount
                fragprintindex_dataset[currentindex, 0] = self.fragprintpos
                fragprintindex_dataset[currentindex, 1] = fragend
                fragprintvalues_dataset.resize((fragend, ))
                fragprintvalues_dataset[self.fragprintpos:fragend] = frags[:]
                self.fragprintpos += fragcount

                sinchi = inchi[0].split('/', 1)
                sinchi = sinchi[1] if len(sinchi) > 1 else ''

                # Text fields in storage order, each with the index dataset
                # and leading index that receive its [start, end) offsets.
                # Slot 2 of "InChi" takes inchi[2] and slot 3 takes
                # inchi[1], as in the original layout.
                text_fields = [
                    (smiles_dataset, (currentindex, ), db[9]),
                    (ids_dataset, (currentindex, ), db[10]),
                    (inchi_dataset, (currentindex, 0), sformula),
                    (inchi_dataset, (currentindex, 1), sinchi),
                    (inchi_dataset, (currentindex, 2), inchi[2]),
                    (inchi_dataset, (currentindex, 3), inchi[1]),
                ]
                # Encode everything before touching the datasets so that a
                # non-ASCII field fails before anything is written.
                payloads = [(index_dataset, where,
                             np.array(bytearray(text.encode('ascii')),
                                      dtype=np.uint8))
                            for index_dataset, where, text in text_fields]
                ascii_dataset.resize(
                    (self.asciipos +
                     sum(len(payload) for _, _, payload in payloads), ))
                for index_dataset, where, payload in payloads:
                    asciiend = self.asciipos + len(payload)
                    index_dataset[where + (0, )] = self.asciipos
                    index_dataset[where + (1, )] = asciiend
                    ascii_dataset[self.asciipos:asciiend] = payload[:]
                    self.asciipos = asciiend
    print('Import Finished!')