Code example #1
def test_studies(db):
    # Create a study with two peaks and verify both are persisted.
    study = Study(pmid=345345, title='test study',
                  authors='Jokkin, Eumast',
                  journal='Journal of Nonexistent Findings',
                  year=2008)
    study.peaks = [Peak(x=-12, y=14, z=40), Peak(x=22, y=22, z=22)]
    db.session.add(study)
    db.session.commit()
    assert Peak.query.count() == 2
    assert Study.query.count() == 1
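
The test above assumes Study and Peak models and a db fixture that are not
shown in this excerpt. As a rough sketch, the models might look like the
following under Flask-SQLAlchemy; the field list is inferred from the test
itself (plus the table column used in the later examples), and the real
definitions almost certainly include more.

# Hypothetical minimal models; inferred from the test, not taken from the
# actual codebase.
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Study(db.Model):
    pmid = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.Text)
    authors = db.Column(db.Text)
    journal = db.Column(db.Text)
    year = db.Column(db.Integer)
    peaks = db.relationship('Peak', backref='study')

class Peak(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    x = db.Column(db.Float)
    y = db.Column(db.Float)
    z = db.Column(db.Float)
    table = db.Column(db.Text)
    study_pmid = db.Column(db.Integer, db.ForeignKey('study.pmid'))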
Code example #2
    def add_studies(self, analyses=None, threshold=0.001, limit=None,
                    reset=False):
        """ Add studies to the DB.
        Args:
            analyses: list of names of analyses to map studies onto. If None,
                use all available.
            threshold: Float or integer; minimum value in AnalysisTable data
                array for inclusion.
            limit: integer; maximum number of studies to add (order will be
                randomized).
            reset: Drop all existing records before populating.
        Notes:
            By default, will not create new Study records if an existing one
            matches. This ensures that we can gracefully add new analysis
            associations without mucking up the DB. To explicitly replace old
            records, pass reset=True.
        """
        if reset:
            Study.query.delete()

        # For efficiency, get all analysis data up front, so we only need to
        # densify array once
        if analyses is None:
            analyses = self._get_feature_names()

        feature_data = self.dataset.get_feature_data(features=analyses)

        study_inds = self.dataset.activations['id'].unique()

        if limit is not None:
            random.shuffle(study_inds)
            study_inds = study_inds[:limit]

        # SQL DBs generally don't like numpy dtypes
        study_inds = [int(ind) for ind in study_inds]

        # Work on a copy so the assignment below doesn't trigger pandas'
        # SettingWithCopyWarning (or silently modify a view).
        all_rows = self.dataset.activations.query('id in @study_inds').copy()
        all_rows[['doi', 'table_num']] = all_rows[['doi', 'table_num']] \
                                            .astype(str).replace('nan', '')

        # Create Study records
        for i, pmid in enumerate(study_inds):

            activ = all_rows.query('id == @pmid')

            study = Study.query.get(pmid)

            # Track the peak count in Python to avoid issuing SQL count()
            # queries; compute it before the branch so it's also defined
            # when the Study record already exists.
            n_peaks = len(activ)

            if study is None:
                peaks = [Peak(x=p['x'], y=p['y'], z=p['z'],
                              table=p['table_num'])
                         for (ind, p) in activ.iterrows()]

                data = activ.iloc[0, :]
                study = Study(
                    pmid=int(pmid),
                    space=data['space'],
                    doi=data['doi'],
                    title=data['title'],
                    journal=data['journal'],
                    authors=data['authors'],
                    year=int(data['year']))
                study.peaks.extend(peaks)
                self.db.session.add(study)

            # Map analyses onto studies via a Frequency join table that also
            # stores frequency info
            pmid_frequencies = feature_data.loc[pmid, :]
            to_keep = pmid_frequencies[pmid_frequencies >= threshold]
            for analysis_name, freq in to_keep.items():
                freq_inst = Frequency(
                    study=study, analysis=self.analyses[analysis_name][0],
                    frequency=freq)
                self.db.session.add(freq_inst)

                # Track number of studies and peaks so we can update
                # Analysis table more efficiently later
                self.analyses[analysis_name][1] += 1
                self.analyses[analysis_name][2] += n_peaks

            # Commit records in batches to conserve memory and speed up
            # querying.
            if (i + 1) % 100 == 0:
                print("Saving study %d..." % i)
                self.db.session.commit()

        self.db.session.commit()  # Commit any remaining studies

        # Update all analysis counts
        self._update_analysis_counts()
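
The Frequency join table referenced in the loop above is not shown in the
excerpt either. Continuing the sketch from example #1, a plausible minimal
association model might look like this; the column names are inferred from
the Frequency(study=..., analysis=..., frequency=...) call, and the Analysis
model is assumed.

# Hypothetical association model; inferred from the constructor call above,
# not taken from the actual codebase.
class Frequency(db.Model):
    study_pmid = db.Column(db.Integer, db.ForeignKey('study.pmid'),
                           primary_key=True)
    analysis_id = db.Column(db.Integer, db.ForeignKey('analysis.id'),
                            primary_key=True)
    frequency = db.Column(db.Float)

    study = db.relationship('Study', backref='frequencies')
    analysis = db.relationship('Analysis', backref='frequencies')

A call to the method itself might then look like the following, assuming an
owning builder object (the enclosing class is not shown in the excerpt):

builder.add_studies(analyses=None, threshold=0.001, limit=500, reset=True)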
Code example #3
    def add_studies(self, analyses=None, threshold=0.001, limit=None,
                    reset=False):
        """ Add studies to the DB.
        Args:
            analyses: list of names of analyses to map studies onto. If None,
                use all available.
            threshold: Float or integer; minimum value in AnalysisTable data
                array for inclusion.
            limit: integer; maximum number of studies to add (order will be
                randomized).
            reset: Drop all existing records before populating.
        Notes:
            By default, will not create new Study records if an existing one
            matches. This ensures that we can gracefully add new analysis
            associations without mucking up the DB. To explicitly replace old
            records, pass reset=True.
        """
        if reset:
            Study.query.delete()

        # For efficiency, get all analysis data up front, so we only need to
        # densify array once
        if analyses is None:
            analyses = self._get_feature_names()

        feature_data = self.dataset.get_feature_data(features=analyses)
        analysis_names = list(feature_data.columns)

        # Materialize as a list so random.shuffle() works under Python 3,
        # where range() is immutable.
        study_inds = list(range(len(self.dataset.mappables)))
        if limit is not None:
            random.shuffle(study_inds)
            study_inds = study_inds[:limit]

        # Create Study records
        for i in study_inds:

            m = self.dataset.mappables[i]
            id = int(m.id)

            study = Study.query.get(id)
            if study is None:
                peaks = [Peak(x=float(p.x),
                              y=float(p.y),
                              z=float(p.z),
                              table=str(p.table_num).replace('nan', '')
                              ) for (ind, p) in m.data.iterrows()]
                data = m.data.iloc[0]
                study = Study(
                    pmid=id,
                    space=data['space'],
                    doi=str(data['doi']).replace('nan', ''),
                    title=data['title'],
                    journal=data['journal'],
                    authors=data['authors'],
                    year=int(data['year']))
                study.peaks.extend(peaks)
                self.db.session.add(study)

            # Map analyses onto studies via a Frequency join table that also
            # stores frequency info
            # .ix was removed from pandas; .loc is the equivalent here
            pmid_frequencies = list(feature_data.loc[m.id, :])

            for (y, analysis_name) in enumerate(analysis_names):
                freq = pmid_frequencies[y]
                if freq >= threshold:
                    freq_inst = Frequency(
                        study=study, analysis=self.analyses[analysis_name][0],
                        frequency=freq)
                    self.db.session.add(freq_inst)

                    # Track number of studies and peaks so we can update
                    # Analysis table more efficiently later
                    self.analyses[analysis_name][1] += 1
                    self.analyses[analysis_name][2] += study.peaks.count()

            # Commit records in batches to conserve memory. This is very slow
            # because we're relying on the declarative base; ideally we'd
            # replace it with SQLAlchemy Core, but that's probably not worth
            # the trouble considering we only re-create the DB once in a blue
            # moon.
            if (i + 1) % 100 == 0:
                self.db.session.commit()

        self.db.session.commit()  # Commit any remaining studies

        # Update all analysis counts
        self._update_analysis_counts()
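
The comment in this version notes that SQLAlchemy Core would be faster than
going through the declarative base. For reference, the Core-level bulk
insert it alludes to might look roughly like the sketch below; the engine
URL and the reflected table name are assumptions, not taken from the real
project.

# Sketch of a Core bulk insert; one executemany round-trip replaces one ORM
# flush per Peak object.
from sqlalchemy import create_engine, MetaData, Table

engine = create_engine('sqlite:///example.db')  # assumed database URL
metadata = MetaData()
peak = Table('peak', metadata, autoload_with=engine)  # reflect existing table

rows = [{'x': -12.0, 'y': 14.0, 'z': 40.0, 'table': '1'},
        {'x': 22.0, 'y': 22.0, 'z': 22.0, 'table': '1'}]
with engine.begin() as conn:
    conn.execute(peak.insert(), rows)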