def get_pubchem_sqlite_local(pubchem_id):
    if not hasattr(settings, 'METAB_PUBCHEM_SQLITE_PTH') or not settings.METAB_PUBCHEM_SQLITE_PTH:
        return ''

    if not pubchem_id:
        return ''

    conn = sqlite3.connect(settings.METAB_PUBCHEM_SQLITE_PTH)
    cursor = conn.cursor()

    try:
        cursor.execute(
            'SELECT * FROM pubchem_compounds WHERE cid={}'.format(pubchem_id))
        names = sql_column_names(cursor)
        rows = cursor.fetchall()
    except sqlite3.OperationalError as e:
        print(e)
        return ''

    if rows:
        # should only be 1 entry per cid so take the first row
        return create_compound_from_pubchem_local(rows[0], names)
    else:
        return ''
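# The two helpers used throughout this module, sql_column_names() and
# check_table_exists_sqlite(), are defined elsewhere. The sketches below are
# assumed implementations, included only for illustration: a column-name to
# row-index map built from cursor.description, and an sqlite_master lookup.
def sql_column_names(cursor):
    # map each column name to its positional index in the result rows
    return {d[0]: i for i, d in enumerate(cursor.description)}


def check_table_exists_sqlite(cursor, table_name):
    # sqlite_master lists every table in an SQLite database
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,))
    return cursor.fetchone() is not None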
def save_xcms_individual_peaks(self, xfid):
    cursor = self.cursor
    cursor.execute('SELECT * FROM c_peaks')
    names = sql_column_names(cursor)

    cpeaks = []
    for row in cursor:
        if len(cpeaks) % 500 == 0:
            CPeak.objects.bulk_create(cpeaks)
            cpeaks = []

        cpeak = CPeak(
            idi=row[names['cid']],
            mz=row[names['mz']],
            mzmin=row[names['mzmin']],
            mzmax=row[names['mzmax']],
            rt=row[names['rt']],
            rtmin=row[names['rtmin']],
            rtmax=row[names['rtmax']],
            rtminraw=row[names['rtminraw']] if 'rtminraw' in names else None,
            rtmaxraw=row[names['rtmaxraw']] if 'rtmaxraw' in names else None,
            intb=row[names['intb']] if 'intb' in names else None,
            _into=row[names['_into']],
            maxo=row[names['maxo']],
            sn=row[names['sn']] if 'sn' in names else None,
            xcmsfileinfo=xfid[row[names['fileid']]])
        cpeaks.append(cpeak)

    CPeak.objects.bulk_create(cpeaks)
def save_xcms_grouped_peaks(self):
    md = self.md
    cursor = self.cursor
    cursor.execute('SELECT * FROM c_peak_groups')
    names = sql_column_names(cursor)

    cpeakgroups = []
    cpeakgroup_d = {}
    for row in cursor:
        if len(cpeakgroups) % 500 == 0:
            CPeakGroup.objects.bulk_create(cpeakgroups)
            cpeakgroups = []

        cpeakgroup = CPeakGroup(
            idi=row[names['grpid']],
            mzmed=row[names['mz']],
            mzmin=row[names['mzmin']],
            mzmax=row[names['mzmax']],
            rtmed=row[names['rt']],
            rtmin=row[names['rtmin']],
            rtmax=row[names['rtmax']],
            npeaks=row[names['npeaks']],
            cpeakgroupmeta=self.cpgm,
            isotopes=row[names['isotopes']] if 'isotopes' in names else None,
            adducts=row[names['adduct']] if 'adduct' in names else None,
            pcgroup=row[names['pcgroup']] if 'pcgroup' in names else None,
        )
        cpeakgroups.append(cpeakgroup)
        cpeakgroup_d[row[names['grpid']]] = cpeakgroup

    CPeakGroup.objects.bulk_create(cpeakgroups)
def save_adduct_annotations(self, ruleset_d):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'adduct_annotations'):
        return 0

    nm_d = {n.idi: n.id for n in NeutralMass.objects.filter(metabinputdata=md)}
    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }

    cursor.execute('SELECT * FROM adduct_annotations')
    names = sql_column_names(cursor)

    ads = []
    for row in cursor:
        if len(ads) % 500 == 0:
            Adduct.objects.bulk_create(ads)
            ads = []

        ad = Adduct(idi=row[names['add_id']],
                    adductrule_id=ruleset_d[row[names['rule_id']]],
                    cpeakgroup_id=cpeakgroups_d[row[names['grpid']]],
                    neutralmass_id=nm_d[row[names['nm_id']]])
        ads.append(ad)

    Adduct.objects.bulk_create(ads)
def save_isotope_annotations(self):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'isotope_annotations'):
        return 0

    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }

    cursor.execute('SELECT * FROM isotope_annotations')
    names = sql_column_names(cursor)

    isos = []
    for row in cursor:
        if len(isos) % 500 == 0:
            Isotope.objects.bulk_create(isos)
            isos = []

        iso = Isotope(
            idi=row[names['iso_id']],
            iso=row[names['iso']],
            charge=row[names['charge']],
            cpeakgroup1_id=cpeakgroups_d[row[names['c_peak_group1_id']]],
            cpeakgroup2_id=cpeakgroups_d[row[names['c_peak_group2_id']]],
            metabinputdata=md)
        isos.append(iso)

    Isotope.objects.bulk_create(isos)
def save_xcms_group_peak_link(self):
    md = self.md
    cursor = self.cursor
    cursor.execute('SELECT * FROM c_peak_X_c_peak_group')
    names = sql_column_names(cursor)

    cpeakgrouplink = []
    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }
    cpeaks_d = {
        c.idi: c.pk
        for c in CPeak.objects.filter(xcmsfileinfo__metabinputdata=md)
    }

    for row in cursor:
        if len(cpeakgrouplink) % 500 == 0:
            CPeakGroupLink.objects.bulk_create(cpeakgrouplink)
            cpeakgrouplink = []

        cpeakgrouplink.append(
            CPeakGroupLink(
                cpeak_id=cpeaks_d[row[names['cid']]],
                cpeakgroup_id=cpeakgroups_d[row[names['grpid']]],
                best_feature=row[names['best_feature']] if 'best_feature' in names else None,
            ))

    CPeakGroupLink.objects.bulk_create(cpeakgrouplink)
    return cpeakgrouplink
def save_adduct_rules(self):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'adduct_rules'):
        return 0

    # update adduct rules
    cursor.execute('SELECT * FROM adduct_rules')
    names = sql_column_names(cursor)

    addr = list(AdductRule.objects.filter().values('adduct_type', 'id'))
    addrd = {a['adduct_type']: a['id'] for a in addr} if addr else {}

    ruleset_d = {}
    for row in cursor:
        if row[names['name']] not in addrd:
            arulei = AdductRule(
                adduct_type=row[names['name']],
                nmol=row[names['nmol']],
                charge=row[names['charge']],
                massdiff=row[names['massdiff']],
                oidscore=row[names['oidscore']],
                quasi=row[names['quasi']],
                ips=row[names['ips']],
                frag_score=row[names['frag_score']] if 'frag_score' in names else None)
            arulei.save()
            ruleset_d[row[names['rule_id']]] = arulei.id
        else:
            ruleset_d[row[names['rule_id']]] = addrd[row[names['name']]]

    return ruleset_d
def save_spectral_matching_annotations(self):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'matches'):
        return 0

    cursor.execute(
        'SELECT * FROM matches LEFT JOIN library_meta ON matches.lid=library_meta.lid')
    names = sql_column_names(cursor)

    speakmeta_d = {
        c.idi: c.pk
        for c in SPeakMeta.objects.filter(metabinputdata=md)
    }
    library_d = {c.accession: c.pk for c in LibrarySpectraMeta.objects.all()}
    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }

    matches = []
    for row in cursor:
        if len(matches) % 500 == 0:
            SpectralMatching.objects.bulk_create(matches)
            matches = []

        # Currently only works for MassBank (or anything from the experimental
        # MONA library)
        if row[names['source_name']] in ['massbank', 'mona-experimental', 'lipidblast']:
            try:
                lsm_id = library_d[row[names['accession']]]
            except KeyError as e:
                print(e)
                lsm_id = None

            match = SpectralMatching(
                idi=row[names['mid']],
                s_peak_meta_id=speakmeta_d[row[names['pid']]],
                score=row[names['score']],
                percentage_match=row[names['perc_mtch']],
                match_num=row[names['match']],
                accession=row[names['accession']],
                name=row[names['name']],
                library_spectra_meta_id=lsm_id)
            matches.append(match)

    SpectralMatching.objects.bulk_create(matches)
def upload_metplus(db_pth):
    # This is a quick way to get a lot of important compounds. The sqlite
    # database is from https://github.com/ICBI/MetPlus-DB. It is 5 years old
    # at the time of writing, so it is potentially missing compounds and
    # can't be completely relied on. Also, it does not contain all of
    # PubChem, so we still have to add many compounds when we upload
    # annotations that used PubChem as the database.
    conn = sqlite3.connect(db_pth)
    conn.text_factory = bytes
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM MetPlus')
    names = sql_column_names(cursor)

    comps = []
    next(cursor)  # first row is a header (strange for an sqlite database!)
    c = 0
    for i, row in enumerate(cursor):
        if Compound.objects.filter(inchikey_id=row[names['INCHIKEY']]):
            continue

        if c > 1000:
            Compound.objects.bulk_create(comps)
            print(i)
            comps = []
            c = 0

        comp = Compound(
            inchikey_id=row[names['INCHIKEY']],
            # Note: the FORMULA column appears to be mislabelled in the source
            # database and may actually hold molecular weights
            exact_mass=row[names['FORMULA']],
            molecular_formula=row[names['MONOISOTOPIC_WEIGHTS']],
            iupac_name=row[names['IUPAC_NAME']].decode('utf-8', 'ignore').encode('utf-8'),
            systematic_name=row[names['SYSTEMATIC_NAME']].decode('utf-8', 'ignore').encode('utf-8'),
            name=row[names['COMMON_NAME']].decode('utf-8', 'ignore').encode('utf-8'),
            trade_name=row[names['TRADE_NAME']].decode('utf-8', 'ignore').encode('utf-8'),
            hmdb_id=row[names['HMDB_ID']],
            lmdb_id=row[names['LMDB_ID']],
            humancyc_id=row[names['HUMANCYC_ID']],
            pubchem_id=row[names['PUBCHEM_CID']],
            chemspider_id=row[names['CHEMSPIDER_ID']],
            chebi_id=row[names['CHEBI_ID']],
            metlin_id=row[names['METLIN_ID']],
            kegg_id=row[names['KEGG_ID']],
            foodb_id=row[names['FooDB_ID']],
        )
        comps.append(comp)
        c += 1

    Compound.objects.bulk_create(comps)
def save_speakmeta_cpeak_frag_link(self):
    md = self.md
    cursor = self.cursor
    CPeakGroupMeta = self.cpeakgroupmeta_class

    if not check_table_exists_sqlite(cursor, 'c_peak_X_s_peak_meta'):
        return 0

    cursor.execute('SELECT * FROM c_peak_X_s_peak_meta')
    names = sql_column_names(cursor)

    speakmeta = SPeakMeta.objects.filter(metabinputdata=md)
    speakmeta_d = {s.idi: s.pk for s in speakmeta}

    cpeaks = CPeak.objects.filter(xcmsfileinfo__metabinputdata=md)
    cpeak_d = {s.idi: s.pk for s in cpeaks}

    speakmeta_cpeak_frag_links = []
    for row in cursor:
        if len(speakmeta_cpeak_frag_links) % 500 == 0:
            SPeakMetaCPeakFragLink.objects.bulk_create(speakmeta_cpeak_frag_links)
            speakmeta_cpeak_frag_links = []

        # this needs to be updated after the SQLite update in msPurity
        speakmeta_cpeak_frag_links.append(
            SPeakMetaCPeakFragLink(
                speakmeta_id=speakmeta_d[row[names['pid']]],
                cpeak_id=cpeak_d[row[names['cid']]],
            ))

    SPeakMetaCPeakFragLink.objects.bulk_create(speakmeta_cpeak_frag_links)

    # Add the number of MS/MS events for each grouped feature (not possible
    # with the Django ORM alone). Note the UPDATE ... INNER JOIN ... SET
    # syntax is MySQL-specific.
    sqlstmt = '''UPDATE mbrowse_cpeakgroup t
                 INNER JOIN
                   (SELECT cpg.id, COUNT(cpgl.id) AS counter
                      FROM mbrowse_cpeakgroup AS cpg
                      LEFT JOIN mbrowse_cpeakgrouplink AS cpgl
                        ON cpgl.cpeakgroup_id=cpg.id
                      LEFT JOIN mbrowse_speakmetacpeakfraglink AS scfl
                        ON cpgl.cpeak_id=scfl.cpeak_id
                     WHERE scfl.id IS NOT NULL AND cpg.cpeakgroupmeta_id={}
                     GROUP BY cpg.id) m
                 ON t.id = m.id
                 SET t.msms_count = m.counter'''.format(self.cpgm.id)

    with connection.cursor() as cursor:
        cursor.execute(sqlstmt)
def save_s_peak_meta(self, runs, celery_obj):
    md = self.md
    cursor = self.cursor
    cursor.execute('SELECT * FROM s_peak_meta')
    names = sql_column_names(cursor)

    speakmetas = []
    for row in cursor:
        # this needs to be updated after the SQLite update in msPurity;
        # bulk_create in batches to stop RAM running out
        if len(speakmetas) % 500 == 0:
            if celery_obj:
                celery_obj.update_state(
                    state='RUNNING',
                    meta={'current': 10, 'total': 100,
                          'status': 'Upload scan peak {}'.format(len(speakmetas))})
            SPeakMeta.objects.bulk_create(speakmetas)
            speakmetas = []

        speakmetas.append(
            SPeakMeta(
                run=runs[row[names['fileid']]],
                idi=row[names['pid']],
                precursor_mz=row[names['precursorMZ']],
                precursor_i=row[names['precursorIntensity']],
                precursor_rt=row[names['precursorRT']],
                precursor_scan_num=row[names['precursorScanNum']],
                precursor_nearest=row[names['precursorNearest']],
                scan_num=row[names['precursorScanNum']],
                a_mz=row[names['aMz']],
                a_purity=row[names['aPurity']],
                a_pknm=row[names['apkNm']],
                i_mz=row[names['iMz']],
                i_purity=row[names['iPurity']],
                i_pknm=row[names['ipkNm']],
                in_purity=row[names['inPurity']],
                in_pknm=row[names['inPkNm']],
                ms_level=2,
                metabinputdata=md))

    SPeakMeta.objects.bulk_create(speakmetas)
def save_eics(self):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'eics'):
        return 0

    cursor.execute('SELECT * FROM eics')
    names = sql_column_names(cursor)

    eicmeta = EicMeta(metabinputdata=md)
    eicmeta.save()

    cpeaks_d = {
        c.idi: c.pk
        for c in CPeak.objects.filter(xcmsfileinfo__metabinputdata=md)
    }
    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }

    eics = []
    c = 0
    for row in cursor:
        if c >= 1000:
            # to save memory
            Eic.objects.bulk_create(eics)
            eics = []
            c = 0

        eic = Eic(
            idi=row[names['eicidi']],
            scan=row[names['scan']],
            intensity=row[names['intensity']],
            rt_raw=row[names['rt_raw']],
            rt_corrected=row[names['rt_corrected']] if 'rt_corrected' in names else None,
            purity=row[names['purity']] if 'purity' in names else None,
            cpeak_id=cpeaks_d[row[names['c_peak_id']]],
            cpeakgroup_id=cpeakgroups_d[row[names['grpid']]],
            eicmeta_id=eicmeta.id)
        eics.append(eic)
        c += 1

    Eic.objects.bulk_create(eics)
def save_s_peaks(self, celery_obj):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 's_peaks'):
        return 0

    speakmeta = SPeakMeta.objects.filter(metabinputdata=md)
    speakmeta_d = {s.idi: s.pk for s in speakmeta}

    cursor.execute('SELECT * FROM s_peaks')
    names = sql_column_names(cursor)

    speaks = []
    for row in cursor:
        speaks.append(
            SPeak(speakmeta_id=speakmeta_d[row[names['pid']]],
                  mz=row[names['mz']],
                  i=row[names['i']]))

        # to stop RAM running out
        if len(speaks) > 1000:
            SPeak.objects.bulk_create(speaks)
            if celery_obj:
                celery_obj.update_state(
                    state='RUNNING',
                    meta={'current': 10, 'total': 100,
                          'status': 'Scan peaks upload, {}'.format(len(speaks))})
            speaks = []

    if speaks:
        print('saving speak objects')
        SPeak.objects.bulk_create(speaks)
def save_neutral_masses(self):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'neutral_masses'):
        return 0

    # update neutral masses
    cursor.execute('SELECT * FROM neutral_masses')
    names = sql_column_names(cursor)

    nms = []
    for row in cursor:
        if len(nms) % 500 == 0:
            NeutralMass.objects.bulk_create(nms)
            nms = []

        nm = NeutralMass(idi=row[names['nm_id']],
                         nm=row[names['mass']],
                         ips=row[names['ips']],
                         metabinputdata=md)
        nms.append(nm)

    NeutralMass.objects.bulk_create(nms)
def save_sirius_csifingerid(self, celery_obj, csi_speed=True):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'sirius_csifingerid_results'):
        return 0

    cursor.execute('SELECT * FROM sirius_csifingerid_results')
    names = sql_column_names(cursor)

    speakmeta_d = {
        c.idi: c.pk
        for c in SPeakMeta.objects.filter(metabinputdata=md)
    }

    speaks = []
    matches = []

    meta = CSIFingerIDMeta()
    meta.save()

    UID_old = ''
    for i, row in enumerate(cursor):
        UID = row[names['UID']]
        if UID == 'UID':  # duplicate header row
            continue

        uid_l = UID.split('-')
        pid = uid_l[2]

        try:
            rank = int(row[names['Rank']])
        except ValueError as e:
            print(e)
            continue

        if rank > 6:
            continue

        if TEST_MODE:
            if i > 3000:
                break

        if celery_obj and i % 500 == 0:
            celery_obj.update_state(
                state='RUNNING',
                meta={'current': 80, 'total': 100,
                      'status': 'SIRIUS CSI-FingerID upload, annotation {}'.format(i)})

        # when a new UID is reached, rank-score and save the previous group
        if UID_old and not UID == UID_old:
            matches = self.rank_score_sirius(matches)
            CSIFingerIDAnnotation.objects.bulk_create(matches)
            matches = []
        UID_old = UID

        match = CSIFingerIDAnnotation(
            idi=i + 1,
            s_peak_meta_id=speakmeta_d[int(pid)],
            inchikey2d=row[names['InChIkey2D']],
            molecular_formula=row[names['molecularFormula']],
            rank=rank,
            score=row[names['Score']],
            name=row[names['Name']],
            links=row[names['links']],
            smiles=row[names['smiles']],
            csifingeridmeta=meta)
        matches.append(match)
        speaks.append(speakmeta_d[int(pid)])

    matches = self.rank_score_sirius(matches)
    CSIFingerIDAnnotation.objects.bulk_create(matches)
def save_probmetab(self, celery_obj):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'probmetab_results'):
        return 0

    cursor.execute('SELECT * FROM probmetab_results')
    names = sql_column_names(cursor)

    cpeakgroups_d = {
        c.idi: c.pk
        for c in CPeakGroup.objects.filter(cpeakgroupmeta=self.cpgm)
    }

    matches = []
    for c, row in enumerate(cursor):
        if TEST_MODE:
            if c > 500:
                break

        if not row[names['grp_id']]:
            continue

        if celery_obj and len(matches) % 100 == 0:
            celery_obj.update_state(
                state='RUNNING',
                meta={'current': 70, 'total': 100,
                      'status': 'Probmetab upload, annotation {}'.format(c)})

        if len(matches) % 500 == 0:
            ProbmetabAnnotation.objects.bulk_create(matches)
            matches = []

        # Expect the majority of KEGG compounds to be in the Compound model
        # already. This needs to be updated to a proper relational lookup, as
        # the regex fails in some cases (see the sketch after this function)!
        kegg_id = row[names['mpc']].split(':')[1]
        comp_search = Compound.objects.filter(
            kegg_id__regex='(^|.*,|")({})("|,.*|$)'.format(kegg_id))

        if comp_search:
            comp = comp_search[0]
        else:
            kegg_compound = get_kegg_compound(kegg_id)
            if 'chebi_id_single' in kegg_compound and kegg_compound['chebi_id_single']:
                inchikey = get_inchi_from_chebi(kegg_compound['chebi_id_single'])
                if inchikey:
                    kegg_compound['inchikey_id'] = inchikey
            comp = save_compound_kegg(kegg_compound)

        match = ProbmetabAnnotation(
            idi=c + 1,
            cpeakgroup_id=cpeakgroups_d[int(row[names['grp_id']])],
            compound=comp,
            prob=row[names['proba']])
        matches.append(match)

    ProbmetabAnnotation.objects.bulk_create(matches)
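# The regex in save_probmetab() above tries to match a single KEGG id inside a
# comma-separated (and sometimes quoted) kegg_id field. The helper below is a
# hypothetical, more defensive alternative for illustration only, assuming the
# field holds values like 'C00031,C00221' or '"C00031","C00221"'; it is not
# part of the original code.
def kegg_id_in_field(kegg_id, field):
    # strip quotes and whitespace from each token before an exact comparison,
    # avoiding the partial matches a raw regex can produce
    tokens = [t.strip().strip('"') for t in field.split(',')]
    return kegg_id in tokens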
def save_metfrag(self, celery_obj):
    md = self.md
    cursor = self.cursor

    if not check_table_exists_sqlite(cursor, 'metfrag_results'):
        return 0

    cursor.execute('SELECT * FROM metfrag_results')
    names = sql_column_names(cursor)

    speakmeta_d = {
        c.idi: c.pk
        for c in SPeakMeta.objects.filter(metabinputdata=md)
    }

    matches = []
    for i, row in enumerate(cursor):
        if TEST_MODE:
            if i > 500:
                break

        UID = row[names['UID']]
        if UID == 'UID':  # duplicate header name
            continue

        uid_l = UID.split('-')
        pid = uid_l[2]

        if not row[names['InChIKey']]:
            # currently only add compounds we have a name for (should be all
            # cases if PubChem was used)
            continue

        try:
            score = float(row[names['Score']])
        except ValueError as e:
            print(e)
            continue

        if score < 0.6:
            # no point storing anything less than 0.6
            continue

        if celery_obj and len(matches) % 100 == 0:
            celery_obj.update_state(
                state='RUNNING',
                meta={'current': 50, 'total': 100,
                      'status': 'MetFrag upload, annotation {}'.format(i)})

        if len(matches) % 100 == 0:
            print(i)
            MetFragAnnotation.objects.bulk_create(matches)
            matches = []

        inchikey = row[names['InChIKey']]
        identifier = row[names['Identifier']]
        comp_search = Compound.objects.filter(inchikey_id=inchikey)

        # Searching PubChem online for every annotation takes too long, so
        # check the local PubChem SQLite snapshot first and only then fall
        # back to the PubChem web service (by CID, then by InChIKey)
        if comp_search:
            comp = comp_search[0]
        else:
            print('CHECK LOCALLY')
            comp = get_pubchem_sqlite_local(identifier)
            if not comp:
                print('CHECK CID')
                pc_matches = get_pubchem_compound(identifier, 'cid')
                if not pc_matches:
                    print('CHECK INCHIKEY')
                    pc_matches = get_pubchem_compound(inchikey, 'inchikey')
                if not pc_matches:
                    print(row)
                    print(pc_matches)
                    print(inchikey)
                    continue
                if len(pc_matches) > 1:
                    print('More than 1 match for inchikey; taking the first match. '
                          'This should only happen in rare cases and we do not have '
                          'the power to distinguish between them anyway!')
                pc_match = pc_matches[0]
                comp = create_pubchem_comp(pc_match)
                comp.save()

        match = MetFragAnnotation(
            idi=i + 1,
            s_peak_meta_id=speakmeta_d[int(pid)],
            compound=comp,
            explained_peaks=row[names['ExplPeaks']],
            formula_explained_peaks=row[names['FormulasOfExplPeaks']],
            fragmentor_score=row[names['FragmenterScore']],
            fragmentor_score_values=row[names['FragmenterScore_Values']],
            maximum_tree_depth=row[names['MaximumTreeDepth']],
            number_peaks_used=row[names['NumberPeaksUsed']],
            score=row[names['Score']])
        matches.append(match)

    MetFragAnnotation.objects.bulk_create(matches)
def save_xcms_file_info(self):
    md = self.md
    cursor = self.cursor
    mfiles = self.mfiles

    if check_table_exists_sqlite(cursor, 'xset_classes'):
        cursor.execute('SELECT * FROM xset_classes')
        names = sql_column_names(cursor)
        xset_classes = {}
        for row in self.cursor:
            xset_classes[row[names['row_names']]] = row[names['class']]
    else:
        xset_classes = {}

    cursor.execute('SELECT * FROM fileinfo')
    names = sql_column_names(cursor)

    xfi_d = {}
    mfile_d = {}
    for row in self.cursor:
        idi = row[names['fileid']]
        fn = row[names['filename']]

        if xset_classes:
            sampleType = xset_classes[os.path.splitext(fn)[0]]
        else:
            # old database schema has this stored in the same table
            sampleType = row[names['sampleclass']]

        mfile_qs = mfiles.filter(original_filename=fn)

        if mfile_qs:
            mfile = mfile_qs[0]
        else:
            # add the file with the most basic of information
            prefix, suffix = os.path.splitext(fn)

            if re.match('.*(?:_POS_|_POSITIVE_).*', prefix):
                polarity_qs = Polarity.objects.filter(polarity='positive')
            elif re.match('.*(?:_NEG_|_NEGATIVE_).*', prefix):
                polarity_qs = Polarity.objects.filter(polarity='negative')
            else:
                polarity_qs = Polarity.objects.filter(polarity='unknown')

            if polarity_qs:
                run = Run(prefix=prefix, polarity=polarity_qs[0])
            else:
                run = Run(prefix=prefix)
            run.save()

            mfile = MFile(
                original_filename=fn,
                run=run,
                mfilesuffix=MFileSuffix.objects.filter(suffix=suffix)[0])
            mfile.save()

        xfi = XCMSFileInfo(idi=idi,
                           filename=fn,
                           classname=sampleType,
                           mfile=mfile,
                           metabinputdata=md)
        xfi.save()

        xfi_d[idi] = xfi
        mfile_d[idi] = mfile

    return xfi_d, mfile_d
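# A hypothetical driver sketching the assumed call order of the methods above.
# The sequencing is inferred from the data dependencies visible in this module
# (e.g. save_xcms_file_info() produces the xfid mapping consumed by
# save_xcms_individual_peaks(), and save_adduct_rules() feeds its ruleset into
# save_adduct_annotations()); the real entry point may differ.
def upload_all(self, celery_obj=None):
    xfi_d, mfile_d = self.save_xcms_file_info()
    # save_s_peak_meta() expects a fileid -> Run mapping; derive it from the
    # MFile objects (assuming MFile.run is the relevant relation)
    runs = {idi: mfile.run for idi, mfile in mfile_d.items()}

    # chromatographic peaks, grouped features and the links between them
    self.save_xcms_individual_peaks(xfi_d)
    self.save_xcms_grouped_peaks()
    self.save_xcms_group_peak_link()

    # CAMERA-style annotations: rules first, then the tables that reference them
    ruleset_d = self.save_adduct_rules()
    self.save_neutral_masses()
    if ruleset_d:
        self.save_adduct_annotations(ruleset_d)
    self.save_isotope_annotations()

    # fragmentation spectra and their links to chromatographic peaks
    self.save_s_peak_meta(runs, celery_obj)
    self.save_s_peaks(celery_obj)
    self.save_speakmeta_cpeak_frag_link()
    self.save_eics()

    # annotation tools
    self.save_spectral_matching_annotations()
    self.save_metfrag(celery_obj)
    self.save_sirius_csifingerid(celery_obj)
    self.save_probmetab(celery_obj)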