def _upload_bytes_io_to_s3(bytes_io_obj: BytesIO, s3p: S3Path):
    """Upload a BytesIO object to s3

    Parameters
    ----------
    bytes_io_obj : BytesIO
        Object to upload
    s3p : S3Path
        An S3Path instance of the full upload url
    """
    logger.info(f"Uploading BytesIO object to s3: {str(s3p)}")
    bytes_io_obj.seek(0)  # Just in case
    s3 = get_s3_client(unsigned=False)
    s3p.put(body=bytes_io_obj, s3=s3)
def _joinpath(fpath: Union[S3Path, Path], other: str) -> Union[S3Path, Path]:
    if isinstance(fpath, Path):
        return fpath.joinpath(other).absolute()
    else:
        if (fpath.to_string().endswith("/") and not other.startswith("/")
                or not fpath.to_string().endswith("/")
                and other.startswith("/")):
            return S3Path.from_string(fpath.to_string() + other)
        elif fpath.to_string().endswith("/") and other.startswith("/"):
            return S3Path.from_string(fpath.to_string() + other[1:])
        elif not fpath.to_string().endswith("/") and not other.startswith("/"):
            return S3Path.from_string(fpath.to_string() + "/" + other)
        else:
            raise ValueError(f"Unable to join {fpath.to_string()} and "
                             f'{other} with "/"')
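# Usage sketch (not part of the original module): _joinpath inserts or
# de-duplicates the "/" separator for both S3 and local paths. The bucket
# and file names below are made-up placeholders.
def _example_joinpath_usage():
    s3_dir = S3Path.from_string("s3://example-bucket/results/")
    local_dir = Path("/tmp/results")
    # Both return "<dir>/plot.pdf" regardless of which side carries the "/".
    print(_joinpath(s3_dir, "plot.pdf"))
    print(_joinpath(local_dir, "plot.pdf"))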
def pg_restore(self, dump_file, **options):
    """Load content into the database from a dump file on s3."""
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif dump_file is not None and not isinstance(dump_file, S3Path):
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import run
    from os import environ

    self.session.close()
    self.grab_session()

    # Add the password to the env
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Pipe the database dump from s3 through this machine into the database
    logger.info("Dumping into the database.")
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    run(' '.join(['aws', 's3', 'cp', dump_file.to_string(), '-', '|',
                  'pg_restore', *self._form_pg_args(), *option_list,
                  '--no-owner']),
        env=my_env, shell=True, check=True)
    self.session.close()
    self.grab_session()
    return dump_file
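# Usage sketch (illustrative, not from the original source): keyword options
# are turned into pg_restore flags, e.g. schema='readonly' becomes
# --schema=readonly and a boolean True such as clean=True becomes the bare
# flag --clean. The s3 url below is a made-up placeholder.
def _example_pg_restore_usage(db):
    return db.pg_restore("s3://example-bucket/dumps/readonly.dump",
                         schema='readonly', clean=True)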
def _get_preassembler():
    s3 = boto3.client('s3')
    test_ontology_path = S3Path(
        bucket='bigmech',
        key='travis/bio_ontology/1.4/mock_ontology.pkl'
    )
    test_ontology = pickle.loads(test_ontology_path.get(s3)['Body'].read())
    print("Loaded test ontology.")
    return DbPreassembler(yes_all=True, ontology=test_ontology)
def _main():
    parser = _make_parser()
    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)

    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()
    s3_cache = S3Path.from_string(args.cache)
    pa = DbPreassembler(args.batch, s3_cache,
                        stmt_type=args.stmt_type,
                        yes_all=args.yes_all)

    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))
    if args.task == 'create':
        pa.create_corpus(db, args.continuing)
    elif args.task == 'update':
        pa.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
def list_dumps():
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    return [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
            for d in res['CommonPrefixes']]
def get_s3_path(self) -> S3Path:
    """Return an S3Path object of the saved s3 location

    Returns
    -------
    S3Path
    """
    if self.s3_location is None:
        raise ValueError("s3_location is not set")
    return S3Path.from_string(self.s3_location)
def list_dumps(started=None, ended=None):
    """List all dumps, optionally filtered by their status.

    Parameters
    ----------
    started : Optional[bool]
        If True, find dumps that have started. If False, find dumps that
        have NOT been started. If None, do not filter by start status.
    ended : Optional[bool]
        The same as `started`, but checking whether the dump is ended
        or not.

    Returns
    -------
    list of S3Path objects
        Each S3Path object contains the bucket and key prefix information
        for a set of dump files, e.g.

            [S3Path(bigmech, indra-db/dumps/2020-07-16/),
             S3Path(bigmech, indra-db/dumps/2020-08-28/),
             S3Path(bigmech, indra-db/dumps/2020-09-18/),
             S3Path(bigmech, indra-db/dumps/2020-11-12/),
             S3Path(bigmech, indra-db/dumps/2020-11-13/)]
    """
    # Get all the dump "directories".
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    if res['KeyCount'] == 0:
        return []
    dumps = [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
             for d in res['CommonPrefixes']]

    # Filter to those that have "started"
    if started is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(Start.file_name()).exists(s3)
                 == started]

    # Filter to those that have "ended"
    if ended is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(End.file_name()).exists(s3) == ended]

    return dumps
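# Usage sketch (illustrative, not from the original source): combine the
# `started` and `ended` filters to find dump runs that are still in progress.
def _example_list_unfinished_dumps():
    in_progress = list_dumps(started=True, ended=False)
    for dump_prefix in in_progress:
        print(dump_prefix)
    return in_progress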
def pg_dump(self, dump_file, **options):
    """Use the pg_dump command to dump part of the database onto s3.

    The `pg_dump` tool must be installed, and must be a version compatible
    with the database(s) being used.

    All keyword arguments are converted into flags/arguments of pg_dump. For
    documentation run `pg_dump --help`. This will also confirm you have
    `pg_dump` installed.

    By default, the "General" and "Connection" options are already set. The
    most likely specification you will want to use is `--table` or
    `--schema`, specifying either a particular table or schema to dump.

    Parameters
    ----------
    dump_file : S3Path or str
        The location on s3 where the content should be dumped.
    """
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif dump_file is not None and not isinstance(dump_file, S3Path):
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import check_call
    from os import environ

    # Make sure the session is fresh and any previous sessions are done.
    self.session.close()
    self.grab_session()

    # Add the password to the env
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Dump the database onto s3, piping through this machine (errors if
    # anything went wrong).
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    cmd = ' '.join(["pg_dump", *self._form_pg_args(), *option_list,
                    '-Fc', '|',
                    'aws', 's3', 'cp', '-', dump_file.to_string()])
    check_call(cmd, shell=True, env=my_env)
    return dump_file
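# Usage sketch (illustrative, not from the original source): dump a single
# schema to s3. The keyword argument schema='readonly' is converted into the
# pg_dump flag --schema=readonly; the destination url is a made-up
# placeholder.
def _example_pg_dump_usage(db):
    return db.pg_dump("s3://example-bucket/dumps/2021-01-01/readonly.dump",
                      schema='readonly')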
def get_latest_dump_file():
    import boto3
    from indra.util.aws import iter_s3_keys
    from indra_db.config import get_s3_dump

    s3 = boto3.client('s3')
    s3_path = get_s3_dump()

    logger.debug("Looking for the latest dump file on s3 to %s." % s3_path)

    # Get the most recent file from s3.
    max_date_str = None
    max_lm_date = None
    latest_key = None
    for key, lm_date in iter_s3_keys(s3, with_dt=True, **s3_path.kw()):
        # Get the date string from the name, ignoring non-standard files.
        suffix = key.split('/')[-1]
        m = re.match(r'readonly-(\S+)\.dump', suffix)
        if m is None:
            logger.debug("{key} is not a standard key, will not be "
                         "considered.".format(key=key))
            continue
        date_str, = m.groups()

        # Compare to the current maxes. If the date_str and the last-modified
        # date don't agree, raise an error.
        if not max_lm_date \
                or date_str > max_date_str and lm_date > max_lm_date:
            max_date_str = date_str
            max_lm_date = lm_date
            latest_key = key
        elif max_lm_date \
                and (date_str > max_date_str or lm_date > max_lm_date):
            raise S3DumpTimeAmbiguityError(key, date_str > max_date_str,
                                           lm_date > max_lm_date)

    logger.debug("Latest dump file from %s was found to be %s."
                 % (s3_path, latest_key))

    return S3Path(s3_path.bucket, latest_key)
def _get_file_pairs_from_group(s3, group: S3Path):
    files = group.list_objects(s3)
    file_pairs = defaultdict(dict)
    got_all = True
    for file_path in files:
        # Get information from the filename, including the cases with and
        # without the id_src label.
        parts = file_path.key.split('_')
        if len(parts) == 2:
            run_id, file_suffix = parts
            id_src = None
        elif len(parts) == 3:
            run_id, id_src, file_suffix = parts
        else:
            raise XDDFileError(f"XDD file does not match known standards: "
                               f"{file_path.key}")
        file_type = file_suffix.split('.')[0]

        # Try getting the file
        try:
            file_obj = s3.get_object(**file_path.kw())
            file_json = json.loads(file_obj['Body'].read())
            file_pairs[(run_id, id_src)][file_type] = file_json
        except Exception as e:
            logger.error(f"Failed to load {file_path}")
            logger.exception(e)
            # Drop any partially loaded files for this run; the dict is keyed
            # by the (run_id, id_src) tuple.
            if (run_id, id_src) in file_pairs:
                del file_pairs[(run_id, id_src)]
            got_all = False

    # Create a dict of tuples from the pairs of files.
    ret = {}
    for batch_id, files in file_pairs.items():
        if len(files) != 2 or 'bib' not in files or 'stmts' not in files:
            logger.warning(f"Run {batch_id} does not have both 'bib' and "
                           f"'stmts' in files: {files.keys()}. Skipping.")
            got_all = False
            continue
        ret[batch_id] = (files['bib'], files['stmts'])
    return ret, got_all
def plot_interesting(
    self,
    outdir: str,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    show_plot: Optional[bool] = False,
    max_proc: Optional[int] = None,
    index_counter: Optional[Union[Iterator, Generator]] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
    log_scale_y: bool = False,
):
    """Plots the same type of plot as plot_dists, but filters A, B

    A, B are filtered to those that fulfill the following:
        - No a-b or b-a explanations
        - Not explained by apriori explanations
        - Without common reactome pathways
        - With a-x-b, b-x-a or shared target explanation

    Parameters
    ----------
    outdir : str
        The output directory to save the plots in. If string starts with
        's3://' upload to s3. outdir must then have the form
        's3://<bucket>/<sub_dir>' where <bucket> must be specified and
        <sub_dir> is optional and may contain subdirectories.
    z_corr : Union[str, pd.DataFrame]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    show_plot : bool
        If True also show plots
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    index_counter : Union[Iterator, Generator]
        An object which produces a new int by using 'next()' on it. The
        integers are used to separate the figures so as to not append new
        plots in the same figure.
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10000.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.
    log_scale_y : bool
        If True, plot the plots in this method with log10 scale on y-axis.
        Default: False.
    """
    # Local file or s3
    if outdir.startswith("s3://"):
        s3_path = S3Path.from_string(outdir)
        od = None
    else:
        s3_path = None
        od = Path(outdir)
        if not od.is_dir():
            od.mkdir(parents=True, exist_ok=True)

    # Get corr stats
    corr_stats: Results = self.get_corr_stats_axb(
        z_corr=z_corr,
        max_proc=max_proc,
        max_so_pairs_size=max_so_pairs_size,
        mp_pairs=mp_pairs,
        run_linear=run_linear,
    )
    fig_index = (next(index_counter) if index_counter
                 else floor(datetime.timestamp(datetime.utcnow())))
    plt.figure(fig_index)
    plt.hist(
        corr_stats.azfb_avg_corrs,
        bins="auto",
        density=True,
        color="b",
        alpha=0.3,
        log=log_scale_y,
    )
    plt.hist(
        corr_stats.avg_x_filtered_corrs,
        bins="auto",
        density=True,
        color="r",
        alpha=0.3,
        log=log_scale_y,
    )
    legend = [
        "Filtered A-X-B for any X",
        "Filtered A-X-B for X in network"
    ]
    sd_str = self.get_sd_str()
    title = (f"avg X corrs, filtered {sd_str} "
             f'({self.script_settings["graph_type"]})')
    plt.title(title)
    plt.ylabel("Norm. Density")
    plt.xlabel("mean(abs(corr(a,x)), abs(corr(x,b))) (SD)")
    plt.legend(legend)
    name = "%s_%s_axb_filtered_hist_comparison.pdf" % (
        sd_str,
        self.script_settings["graph_type"],
    )

    # Save to file or ByteIO and S3
    if od is None:
        fname = BytesIO()
    else:
        fname = od.joinpath(name).as_posix()
    plt.savefig(fname, format="pdf")
    if od is None:
        # Reset pointer
        fname.seek(0)

        # Upload to s3
        full_s3_path = _joinpath(s3_path, name)
        _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path)

    # Show plot
    if show_plot:
        plt.show()

    # Close figure
    plt.close(fig_index)
def plot_corr_stats(
    self,
    outdir: str,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    show_plot: bool = False,
    max_proc: Optional[int] = None,
    index_counter: Optional[Union[Iterator, Generator]] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
    log_scale_y: bool = False,
):
    """Plot the results of running explainer.get_corr_stats_axb()

    Parameters
    ----------
    outdir : str
        The output directory to save the plots in. If string starts with
        's3://' upload to s3. outdir must then have the form
        's3://<bucket>/<sub_dir>' where <bucket> must be specified and
        <sub_dir> is optional and may contain subdirectories.
    z_corr : Union[str, pd.DataFrame]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    show_plot : bool
        If True, also show plots after saving them. Default False.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    index_counter : Union[Iterator, Generator]
        An object which produces a new int by using 'next()' on it. The
        integers are used to separate the figures so as to not append new
        plots in the same figure.
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.
    log_scale_y : bool
        If True, plot the plots in this method with log10 scale on y-axis.
        Default: False.
    """
    # Local file or s3
    if outdir.startswith("s3://"):
        s3_path = S3Path.from_string(outdir)
        logger.info(f"Outdir path is on S3: {str(s3_path)}")
        od = None
    else:
        s3_path = None
        od = Path(outdir)
        if not od.is_dir():
            logger.info(f"Creating directory/ies for {od}")
            od.mkdir(parents=True, exist_ok=True)

    # Get corr stats
    corr_stats: Results = self.get_corr_stats_axb(
        z_corr=z_corr,
        max_proc=max_proc,
        max_so_pairs_size=max_so_pairs_size,
        mp_pairs=mp_pairs,
        run_linear=run_linear,
    )
    sd_str = self.get_sd_str()
    for m, (plot_type, data) in enumerate(corr_stats.dict().items()):
        if len(data) > 0:
            name = f'{plot_type}_{self.script_settings["graph_type"]}.pdf'
            logger.info(f"Using file name {name}")
            if od is None:
                fname = BytesIO()
            else:
                fname = od.joinpath(name).as_posix()
            if isinstance(data[0], tuple):
                data = [t[-1] for t in data]

            fig_index = next(index_counter) if index_counter else m
            plt.figure(fig_index)
            plt.hist(x=data, bins="auto", log=log_scale_y)
            title = (f'{plot_type.replace("_", " ").capitalize()}; '
                     f'{sd_str} {self.script_settings["graph_type"]}')
            plt.title(title)
            plt.xlabel("combined z-score")
            plt.ylabel("count")

            # Save to file or ByteIO and S3
            plt.savefig(fname, format="pdf")
            if od is None:
                # Reset pointer
                fname.seek(0)

                # Upload to s3
                full_s3_path = _joinpath(s3_path, name)
                _upload_bytes_io_to_s3(bytes_io_obj=fname,
                                       s3p=full_s3_path)

            # Show plot
            if show_plot:
                plt.show()

            # Close figure
            plt.close(fig_index)
        else:
            logger.warning(f"Empty result for {plot_type} in "
                           f"range {sd_str} for graph type "
                           f'{self.script_settings["graph_type"]}')
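# Usage sketch (illustrative, not from the original source): write the
# correlation-statistics histograms either to a local directory or straight
# to s3, depending on the outdir prefix. The bucket name is a made-up
# placeholder.
def _example_plot_corr_stats_usage(explainer):
    explainer.plot_corr_stats(outdir="s3://example-bucket/corr_plots/",
                              run_linear=True, log_scale_y=True)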
def get_corr_stats_axb(
    self,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    max_proc: Optional[int] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
) -> Results:
    """Get statistics of the correlations from different explanation types

    Note: the provided options have no effect if the data is loaded
    from cache.

    Parameters
    ----------
    z_corr : Optional[Union[pd.DataFrame, str]]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000. If the number of pairs
        to check is smaller than 10 000, no sampling is done.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.

    Returns
    -------
    Results
        A BaseModel containing correlation data for different explanations
    """
    if not self.corr_stats_axb:
        s3 = get_s3_client(unsigned=False)
        try:
            corr_stats_loc = self.get_s3_corr_stats_path()
            if S3Path.from_string(corr_stats_loc).exists(s3):
                logger.info(f"Found corr stats data at {corr_stats_loc}")
                corr_stats_json = file_opener(corr_stats_loc)
                self.corr_stats_axb = Results(**corr_stats_json)
            else:
                logger.info(f"No corr stats data found at "
                            f"{corr_stats_loc}")
        except ValueError as ve:
            # Raised when s3 location is not set
            logger.warning(ve)

        # If not found on s3 or ValueError was raised
        if not self.corr_stats_axb:
            logger.info("Generating corr stats data")
            # Load correlation matrix
            if z_corr is None:
                z_corr = self.load_z_corr()
            if isinstance(z_corr, str):
                z_corr = self.load_z_corr(local_file_path=z_corr)
            # Load reactome if present
            try:
                reactome = self.load_reactome()
            except FileNotFoundError:
                logger.info("No reactome file used in script")
                reactome = None
            self.corr_stats_axb: Results = axb_stats(
                self.expl_df,
                self.stats_df,
                z_corr=z_corr,
                reactome=reactome,
                eval_str=False,
                max_proc=max_proc,
                max_corr_pairs=max_so_pairs_size,
                do_mp_pairs=mp_pairs,
                run_linear=run_linear,
            )
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                logger.info(f"Uploading corr stats to S3 at "
                            f"{corr_stats_loc}")
                s3p_loc = S3Path.from_string(corr_stats_loc)
                s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                logger.info("Finished uploading corr stats to S3")
            except ValueError:
                logger.warning("Unable to upload corr stats to S3")
    else:
        logger.info("Data already present in corr_stats_axb")
    return self.corr_stats_axb
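# Usage sketch (illustrative, not from the original source): the first call
# computes the statistics (or downloads a cached copy from s3) and stores
# them on the instance; subsequent calls return the cached Results, so the
# keyword options only take effect on the first call.
def _example_get_corr_stats_usage(explainer):
    results = explainer.get_corr_stats_axb(max_so_pairs_size=5000,
                                           run_linear=True)
    return results.dict()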
                                  sup_links=list(sg.edges))
            beliefs.update(calculate_belief(stmts))
            group = set()
        return beliefs
    else:
        stmts = load_mock_statements(db)
        return calculate_belief(stmts)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DB Belief Score Dumper')
    parser.add_argument('--fname', nargs='?', type=str,
                        default='belief_dict.pkl',
                        help='Filename of the belief dict output')
    parser.add_argument('-s3', action='store_true', default=False,
                        help='Upload belief dict to the bigmech s3 bucket '
                             'instead of saving it locally')
    args = parser.parse_args()
    belief_dict = get_belief()
    if args.s3:
        key = '/'.join([datetime.utcnow().strftime('%Y-%m-%d'), args.fname])
        s3_path = S3Path(S3_SUBDIR, key)
        upload_pickle_to_s3(obj=belief_dict, s3_path=s3_path)
    else:
        with open(args.fname, 'wb') as f:
            pickle.dump(belief_dict, f)
class XddManager:
    bucket = S3Path(bucket='hms-uw-collaboration')
    reader_versions = {'REACH': '1.3.3-61059a-biores-e9ee36',
                       'SPARSER': 'February2020-linux'}
    indra_version = '1.16.0-c439fdbc936f4eac00cafd559927d7ee06c492e8'

    def __init__(self):
        self.groups = None
        self.statements = None
        self.text_content = None

    def load_groups(self, db):
        logger.info("Finding groups that have not been handled yet.")
        s3 = boto3.client('s3')
        groups = self.bucket.list_prefixes(s3)
        previous_groups = {s for s, in db.select_all(db.XddUpdates.day_str)}

        self.groups = [group for group in groups
                       if group.key[:-1] not in previous_groups]
        return

    def load_statements(self, db):
        logger.info("Loading statements.")
        s3 = boto3.client('s3')
        self.statements = defaultdict(lambda: defaultdict(list))
        self.text_content = {}
        for group in self.groups:
            logger.info(f"Processing {group.key}")
            file_pair_dict = _get_file_pairs_from_group(s3, group)
            for (run_id, id_src), (bibs, stmts) in file_pair_dict.items():
                logger.info(f"Loading {run_id}")
                doi_lookup = {bib['_xddid']: bib['identifier'][0]['id'].upper()
                              for bib in bibs if 'identifier' in bib}
                pub_lookup = {bib['_xddid']: bib['publisher'] for bib in bibs}
                dois = {doi for doi in doi_lookup.values()}
                trids = _get_trids_from_dois(db, dois)

                for sj in stmts:
                    ev = sj['evidence'][0]
                    xddid = ev['text_refs']['CONTENT_ID']
                    ev.pop('pmid', None)
                    if xddid not in doi_lookup:
                        logger.warning("Skipping statement because bib "
                                       "lacked a DOI.")
                        continue
                    ev['text_refs']['DOI'] = doi_lookup[xddid]

                    trid = trids[doi_lookup[xddid]]
                    ev['text_refs']['TRID'] = trid
                    ev['text_refs']['XDD_RUN_ID'] = run_id
                    ev['text_refs']['XDD_GROUP_ID'] = group.key

                    self.statements[trid][ev['text_refs']['READER']].append(sj)
                    if trid not in self.text_content:
                        if id_src:
                            src = f'xdd-{id_src}'
                        else:
                            src = 'xdd'
                        self.text_content[trid] = \
                            (trid, src, 'xdd', 'fulltext',
                             pub_lookup[xddid] == 'bioRxiv')
        return

    def dump_statements(self, db):
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.id],
            db.TextContent.text_ref_id.in_(self.statements.keys()),
            db.TextContent.format == 'xdd'
        )
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt,
                        reading_id,
                        indra_version=self.indra_version
                    )
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading', r_rows, r_cols, commit=False)

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        db.copy_lazy('raw_statements', s_rows,
                     DatabaseStatementData.get_cols(), commit=False)
        if len(stmts):
            insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                              commit=False)

        update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                        group.key[:-1])
                       for group in self.groups]
        db.copy('xdd_updates', update_rows,
                ('reader_versions', 'indra_version', 'day_str'))
        return

    def run(self, db):
        self.load_groups(db)
        self.load_statements(db)
        self.dump_statements(db)