def test_studies(db):
    study = Study(pmid=345345, title='test study', authors='Jokkin, Eumast',
                  journal='Journal of Nonexistent Findings', year=2008)
    study.peaks = [Peak(x=-12, y=14, z=40), Peak(x=22, y=22, z=22)]
    db.session.add(study)
    db.session.commit()
    assert Peak.query.count() == 2
    assert Study.query.count() == 1
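# For context, the test above assumes Flask-SQLAlchemy models along these
# lines. This is a minimal, hypothetical sketch: the real Study and Peak
# definitions live elsewhere in the project and carry more columns (e.g.
# space and doi, which add_studies below also populates).
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Study(db.Model):
    pmid = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String)
    authors = db.Column(db.String)
    journal = db.Column(db.String)
    year = db.Column(db.Integer)
    peaks = db.relationship('Peak', backref='study')


class Peak(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    x = db.Column(db.Float)
    y = db.Column(db.Float)
    z = db.Column(db.Float)
    table = db.Column(db.String)
    study_id = db.Column(db.Integer, db.ForeignKey('study.pmid'))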
def add_studies(self, analyses=None, threshold=0.001, limit=None,
                reset=False):
    """ Add studies to the DB.

    Args:
        analyses: list of names of analyses to map studies onto. If None,
            use all available.
        threshold: Float or integer; minimum value in AnalysisTable data
            array for inclusion.
        limit: integer; maximum number of studies to add (order will be
            randomized).
        reset: Drop all existing records before populating.

    Notes: By default, will not create new Study records if an existing one
        matches. This ensures that we can gracefully add new analysis
        associations without mucking up the DB. To explicitly replace old
        records, pass reset=True.
    """
    if reset:
        Study.query.delete()

    # For efficiency, get all analysis data up front, so we only need to
    # densify the array once
    if analyses is None:
        analyses = self._get_feature_names()
    feature_data = self.dataset.get_feature_data(features=analyses)

    study_inds = self.dataset.activations['id'].unique()

    if limit is not None:
        random.shuffle(study_inds)
        study_inds = study_inds[:limit]

    # SQL DBs generally don't like numpy dtypes
    study_inds = [int(ind) for ind in study_inds]

    all_rows = self.dataset.activations.query('id in @study_inds')
    all_rows[['doi', 'table_num']] = all_rows[['doi', 'table_num']] \
        .astype(str).replace('nan', '')

    # Create Study records
    for i, pmid in enumerate(study_inds):

        activ = all_rows.query('id == @pmid')
        study = Study.query.get(pmid)

        if study is None:
            peaks = [Peak(x=p['x'], y=p['y'], z=p['z'],
                          table=p['table_num'])
                     for (ind, p) in activ.iterrows()]
            # Track in Python to avoid issuing SQL count() queries
            n_peaks = len(peaks)

            data = activ.iloc[0, :]
            study = Study(
                pmid=int(pmid),
                space=data['space'],
                doi=data['doi'],
                title=data['title'],
                journal=data['journal'],
                authors=data['authors'],
                year=int(data['year']))
            study.peaks.extend(peaks)
            self.db.session.add(study)

            # Map analyses onto studies via a Frequency join table that
            # also stores frequency info
            pmid_frequencies = feature_data.loc[pmid, :]
            to_keep = pmid_frequencies[pmid_frequencies >= threshold]

            for analysis_name, freq in to_keep.iteritems():
                freq_inst = Frequency(
                    study=study,
                    analysis=self.analyses[analysis_name][0],
                    frequency=freq)
                self.db.session.add(freq_inst)

                # Track number of studies and peaks so we can update the
                # Analysis table more efficiently later
                self.analyses[analysis_name][1] += 1
                self.analyses[analysis_name][2] += n_peaks

        # Commit records in batches to conserve memory and speed up
        # querying.
        if (i + 1) % 100 == 0:
            print("Saving study %d..." % i)
            self.db.session.commit()

    self.db.session.commit()  # Commit any remaining studies

    # Update all analysis counts
    self._update_analysis_counts()
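# A usage sketch for the loader above. DatabaseBuilder, and the db/dataset
# objects passed to it, are assumptions standing in for whatever class
# actually owns add_studies; only the add_studies arguments come from the
# signature above.
builder = DatabaseBuilder(db=db, dataset=dataset)

# Rebuild from scratch, keeping a random sample of 500 studies and linking
# each one to every analysis whose frequency is at least 0.001.
builder.add_studies(analyses=None, threshold=0.001, limit=500, reset=True)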
def add_studies(self, analyses=None, threshold=0.001, limit=None,
                reset=False):
    """ Add studies to the DB.

    Args:
        analyses: list of names of analyses to map studies onto. If None,
            use all available.
        threshold: Float or integer; minimum value in AnalysisTable data
            array for inclusion.
        limit: integer; maximum number of studies to add (order will be
            randomized).
        reset: Drop all existing records before populating.

    Notes: By default, will not create new Study records if an existing one
        matches. This ensures that we can gracefully add new analysis
        associations without mucking up the DB. To explicitly replace old
        records, pass reset=True.
    """
    if reset:
        Study.query.delete()

    # For efficiency, get all analysis data up front, so we only need to
    # densify the array once
    if analyses is None:
        analyses = self._get_feature_names()
    feature_data = self.dataset.get_feature_data(features=analyses)
    analysis_names = list(feature_data.columns)

    study_inds = range(len(self.dataset.mappables))

    if limit is not None:
        random.shuffle(study_inds)
        study_inds = study_inds[:limit]

    # Create Study records
    for i in study_inds:

        m = self.dataset.mappables[i]
        id = int(m.id)
        study = Study.query.get(id)

        if study is None:
            peaks = [Peak(x=float(p.x), y=float(p.y), z=float(p.z),
                          table=str(p.table_num).replace('nan', ''))
                     for (ind, p) in m.data.iterrows()]
            data = m.data.iloc[0]
            study = Study(
                pmid=id,
                space=data['space'],
                doi=str(data['doi']).replace('nan', ''),
                title=data['title'],
                journal=data['journal'],
                authors=data['authors'],
                year=data['year'])
            study.peaks.extend(peaks)
            self.db.session.add(study)

        # Map analyses onto studies via a Frequency join table that also
        # stores frequency info
        pmid_frequencies = list(feature_data.ix[m.id, :])
        for (y, analysis_name) in enumerate(analysis_names):
            freq = pmid_frequencies[y]
            if pmid_frequencies[y] >= threshold:
                freq_inst = Frequency(
                    study=study,
                    analysis=self.analyses[analysis_name][0],
                    frequency=freq)
                self.db.session.add(freq_inst)

                # Track number of studies and peaks so we can update the
                # Analysis table more efficiently later
                self.analyses[analysis_name][1] += 1
                self.analyses[analysis_name][2] += study.peaks.count()

        # Commit records in batches to conserve memory.
        # This is very slow because we're relying on the declarative base.
        # Ideally should replace this with use of SQLAlchemy core, but
        # probably not worth the trouble considering we only re-create the
        # DB once in a blue moon.
        if (i + 1) % 100 == 0:
            self.db.session.commit()

    self.db.session.commit()  # Commit any remaining studies

    # Update all analysis counts
    self._update_analysis_counts()
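# Both versions of add_studies above hang analysis associations off a
# Frequency join model. A hypothetical sketch of that model follows; the
# real column and backref names may differ, and the Analysis model it
# references is not shown here.
class Frequency(db.Model):
    study_id = db.Column(db.Integer, db.ForeignKey('study.pmid'),
                         primary_key=True)
    analysis_id = db.Column(db.Integer, db.ForeignKey('analysis.id'),
                            primary_key=True)
    frequency = db.Column(db.Float)

    study = db.relationship('Study', backref='frequencies')
    analysis = db.relationship('Analysis', backref='frequencies')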