def _joinpath(fpath: Union[S3Path, Path], other: str) -> Union[S3Path, Path]:
    """Join a base path with a sub-path, for both local and S3 paths.

    Parameters
    ----------
    fpath : Union[S3Path, Path]
        The base path. A local ``Path`` is joined via ``pathlib`` and
        returned as an absolute path; an ``S3Path`` is joined textually.
    other : str
        The path fragment to append to `fpath`.

    Returns
    -------
    Union[S3Path, Path]
        The joined path, of the same kind as `fpath`.
    """
    if isinstance(fpath, Path):
        return fpath.joinpath(other).absolute()

    # S3 paths are joined as strings; normalize so that exactly one "/"
    # separates the two parts, whatever the inputs' edges look like.
    # (The original code re-evaluated fpath.to_string() in every branch
    # and ended with an unreachable `raise`, since the three cases below
    # are exhaustive.)
    base = fpath.to_string()
    if base.endswith("/") and other.startswith("/"):
        # Both sides supply a slash: drop the duplicate.
        return S3Path.from_string(base + other[1:])
    if not base.endswith("/") and not other.startswith("/"):
        # Neither side supplies a slash: insert one.
        return S3Path.from_string(base + "/" + other)
    # Exactly one side supplies the slash: concatenate directly.
    return S3Path.from_string(base + other)
def pg_restore(self, dump_file, **options):
    """Load content into the database from a dump file on s3.

    All keyword arguments are converted into flags/arguments of
    `pg_restore` (`--no-owner` is always passed).

    Parameters
    ----------
    dump_file : S3Path or str
        The location on s3 from which the content should be loaded.

    Returns
    -------
    S3Path
        The s3 location that was restored from.
    """
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif not isinstance(dump_file, S3Path):
        # NOTE: previously `None` slipped past validation and crashed
        # later with an opaque AttributeError on `.to_string()`; now it
        # is rejected up front with a clear message.
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import run
    from os import environ

    # Make sure the session is fresh and any previous session is done.
    self.session.close()
    self.grab_session()

    # Add the password to the env so pg_restore can authenticate.
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Pipe the database dump from s3 through this machine into the
    # database. Boolean-true options become bare flags; everything else
    # becomes --opt=val.
    logger.info("Dumping into the database.")
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    run(' '.join(['aws', 's3', 'cp', dump_file.to_string(), '-', '|',
                  'pg_restore', *self._form_pg_args(), *option_list,
                  '--no-owner']),
        env=my_env, shell=True, check=True)
    self.session.close()
    self.grab_session()
    return dump_file
def _main():
    """Parse command line arguments and run the selected preassembly task."""
    args = _make_parser().parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)

    # Connect to the requested database and open a session.
    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()

    # Build the preassembler around the configured s3 cache location.
    preassembler = DbPreassembler(
        args.batch,
        S3Path.from_string(args.cache),
        stmt_type=args.stmt_type,
        yes_all=args.yes_all,
    )

    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))
    if args.task == 'create':
        preassembler.create_corpus(db, args.continuing)
    elif args.task == 'update':
        preassembler.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
def get_s3_path(self) -> S3Path:
    """Return an S3Path object of the saved s3 location

    Returns
    -------
    S3Path

    Raises
    ------
    ValueError
        If no s3 location has been set on this instance.
    """
    location = self.s3_location
    if location is None:
        raise ValueError("s3_location is not set")
    return S3Path.from_string(location)
def pg_dump(self, dump_file, **options):
    """Use the pg_dump command to dump part of the database onto s3.

    The `pg_dump` tool must be installed, and must be a compatible version
    with the database(s) being used.

    All keyword arguments are converted into flags/arguments of pg_dump. For
    documentation run `pg_dump --help`. This will also confirm you have
    `pg_dump` installed.

    By default, the "General" and "Connection" options are already set. The
    most likely specification you will want to use is `--table` or
    `--schema`, specifying either a particular table or schema to dump.

    Parameters
    ----------
    dump_file : S3Path or str
        The location on s3 where the content should be dumped.

    Returns
    -------
    S3Path
        The s3 location that was dumped to.
    """
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif not isinstance(dump_file, S3Path):
        # NOTE: previously `None` slipped past validation and crashed
        # later with an opaque AttributeError on `.to_string()`; now it
        # is rejected up front with a clear message.
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import check_call
    from os import environ

    # Make sure the session is fresh and any previous session are done.
    self.session.close()
    self.grab_session()

    # Add the password to the env so pg_dump can authenticate.
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Dump the database onto s3, piping through this machine (errors if
    # anything went wrong). Boolean-true options become bare flags;
    # everything else becomes --opt=val.
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    cmd = ' '.join(["pg_dump", *self._form_pg_args(), *option_list, '-Fc',
                    '|', 'aws', 's3', 'cp', '-', dump_file.to_string()])
    check_call(cmd, shell=True, env=my_env)
    return dump_file
def plot_interesting( self, outdir: str, z_corr: Optional[Union[str, pd.DataFrame]] = None, show_plot: Optional[bool] = False, max_proc: Optional[int] = None, index_counter: Optional[Union[Iterator, Generator]] = None, max_so_pairs_size: int = 10000, mp_pairs: bool = True, run_linear: bool = False, log_scale_y: bool = False, ): """Plots the same type of plot as plot_dists, but filters A, B A, B are filtered to those that fulfill the following: - No a-b or b-a explanations - Not explained by apriori explanations - Without common reactome pathways - With a-x-b, b-x-a or shared target explanation Parameters ---------- outdir : str The output directory to save the plots in. If string starts with 's3://' upload to s3. outdir must then have the form 's3://<bucket>/<sub_dir>' where <bucket> must be specified and <sub_dir> is optional and may contain subdirectories. z_corr : Union[str, pd.DataFrame] A pd.DataFrame containing the correlation z scores used to create the statistics in this object. If not provided, an attempt will be made to load it from the file path present in script_settings. show_plot : bool If True also show plots max_proc : int > 0 The maximum number of processes to run in the multiprocessing in get_corr_stats_mp. Default: multiprocessing.cpu_count() index_counter : Union[Iterator, Generator] An object which produces a new int by using 'next()' on it. The integers are used to separate the figures so as to not append new plots in the same figure. max_so_pairs_size : int The maximum number of correlation pairs to process. If the number of eligible pairs is larger than this number, a random sample of max_so_pairs_size is used. Default: 10000. mp_pairs : bool If True, get the pairs to process using multiprocessing if larger than 10 000. Default: True. run_linear : bool If True, gather the data without multiprocessing. This option is good when debugging or if the environment for some reason does not support multiprocessing. Default: False. 
log_scale_y : bool If True, plot the plots in this method with log10 scale on y-axis. Default: False. """ # Local file or s3 if outdir.startswith("s3://"): s3_path = S3Path.from_string(outdir) od = None else: s3_path = None od = Path(outdir) if not od.is_dir(): od.mkdir(parents=True, exist_ok=True) # Get corr stats corr_stats: Results = self.get_corr_stats_axb( z_corr=z_corr, max_proc=max_proc, max_so_pairs_size=max_so_pairs_size, mp_pairs=mp_pairs, run_linear=run_linear, ) fig_index = (next(index_counter) if index_counter else floor( datetime.timestamp(datetime.utcnow()))) plt.figure(fig_index) plt.hist( corr_stats.azfb_avg_corrs, bins="auto", density=True, color="b", alpha=0.3, log=log_scale_y, ) plt.hist( corr_stats.avg_x_filtered_corrs, bins="auto", density=True, color="r", alpha=0.3, log=log_scale_y, ) legend = [ "Filtered A-X-B for any X", "Filtered A-X-B for X in network" ] sd_str = self.get_sd_str() title = (f"avg X corrs, filtered {sd_str} " f'({self.script_settings["graph_type"]})') plt.title(title) plt.ylabel("Norm. Density") plt.xlabel("mean(abs(corr(a,x)), abs(corr(x,b))) (SD)") plt.legend(legend) name = "%s_%s_axb_filtered_hist_comparison.pdf" % ( sd_str, self.script_settings["graph_type"], ) # Save to file or ByteIO and S3 if od is None: fname = BytesIO() else: fname = od.joinpath(name).as_posix() plt.savefig(fname, format="pdf") if od is None: # Reset pointer fname.seek(0) # Upload to s3 full_s3_path = _joinpath(s3_path, name) _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path) # Show plot if show_plot: plt.show() # Close figure plt.close(fig_index)
def plot_corr_stats( self, outdir: str, z_corr: Optional[Union[str, pd.DataFrame]] = None, show_plot: bool = False, max_proc: bool = None, index_counter: Optional[Union[Iterator, Generator]] = None, max_so_pairs_size: int = 10000, mp_pairs: bool = True, run_linear: bool = False, log_scale_y: bool = False, ): """Plot the results of running explainer.get_corr_stats_axb() Parameters ---------- outdir : str The output directory to save the plots in. If string starts with 's3://' upload to s3. outdir must then have the form 's3://<bucket>/<sub_dir>' where <bucket> must be specified and <sub_dir> is optional and may contain subdirectories. z_corr : Union[str, pd.DataFrame] A pd.DataFrame containing the correlation z scores used to create the statistics in this object. If not provided, an attempt will be made to load it from the file path present in script_settings. show_plot : bool If True, also show plots after saving them. Default False. max_proc : int > 0 The maximum number of processes to run in the multiprocessing in get_corr_stats_mp. Default: multiprocessing.cpu_count() index_counter : Union[Iterator, Generator] An object which produces a new int by using 'next()' on it. The integers are used to separate the figures so as to not append new plots in the same figure. max_so_pairs_size : int The maximum number of correlation pairs to process. If the number of eligible pairs is larger than this number, a random sample of max_so_pairs_size is used. Default: 10 000. mp_pairs : bool If True, get the pairs to process using multiprocessing if larger than 10 000. Default: True. run_linear : bool If True, gather the data without multiprocessing. This option is good when debugging or if the environment for some reason does not support multiprocessing. Default: False. log_scale_y : bool If True, plot the plots in this method with log10 scale on y-axis. Default: False. 
""" # Local file or s3 if outdir.startswith("s3://"): s3_path = S3Path.from_string(outdir) logger.info(f"Outdir path is on S3: {str(s3_path)}") od = None else: s3_path = None od = Path(outdir) if not od.is_dir(): logger.info(f"Creating directory/ies for {od}") od.mkdir(parents=True, exist_ok=True) # Get corr stats corr_stats: Results = self.get_corr_stats_axb( z_corr=z_corr, max_proc=max_proc, max_so_pairs_size=max_so_pairs_size, mp_pairs=mp_pairs, run_linear=run_linear, ) sd_str = self.get_sd_str() for m, (plot_type, data) in enumerate(corr_stats.dict().items()): if len(data) > 0: name = f'{plot_type}_{self.script_settings["graph_type"]}.pdf' logger.info(f"Using file name {name}") if od is None: fname = BytesIO() else: fname = od.joinpath(name).as_posix() if isinstance(data[0], tuple): data = [t[-1] for t in data] fig_index = next(index_counter) if index_counter else m plt.figure(fig_index) plt.hist(x=data, bins="auto", log=log_scale_y) title = (f'{plot_type.replace("_", " ").capitalize()}; ' f'{sd_str} {self.script_settings["graph_type"]}') plt.title(title) plt.xlabel("combined z-score") plt.ylabel("count") # Save to file or ByteIO and S3 plt.savefig(fname, format="pdf") if od is None: # Reset pointer fname.seek(0) # Upload to s3 full_s3_path = _joinpath(s3_path, name) _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path) # Show plot if show_plot: plt.show() # Close figure plt.close(fig_index) else: logger.warning(f"Empty result for {plot_type} in " f"range {sd_str} for graph type " f'{self.script_settings["graph_type"]}')
def get_corr_stats_axb(
    self,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    max_proc: Optional[int] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
) -> Results:
    """Get statistics of the correlations from different explanation types

    Note: the provided options have no effect if the data is loaded
    from cache.

    Parameters
    ----------
    z_corr : Optional[Union[pd.DataFrame, str]]
        A pd.DataFrame containing the correlation z scores used to
        create the statistics in this object. If given as a string, it
        is treated as a local file path to load the correlations from.
        If not provided, an attempt will be made to load it from the
        file path present in script_settings.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing
        in get_corr_stats_mp. Default: multiprocessing.cpu_count()
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the
        number of eligible pairs is larger than this number, a random
        sample of max_so_pairs_size is used. Default: 10 000. If the
        number of pairs to check is smaller than 10 000, no sampling is
        done.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if
        larger than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option
        is good when debugging or if the environment for some reason
        does not support multiprocessing. Default: False.

    Returns
    -------
    Results
        A BaseModel containing correlation data for different
        explanations
    """
    if not self.corr_stats_axb:
        s3 = get_s3_client(unsigned=False)
        # First, try to load previously computed results from s3.
        try:
            corr_stats_loc = self.get_s3_corr_stats_path()
            if S3Path.from_string(corr_stats_loc).exists(s3):
                logger.info(f"Found corr stats data at {corr_stats_loc}")
                corr_stats_json = file_opener(corr_stats_loc)
                self.corr_stats_axb = Results(**corr_stats_json)
            else:
                logger.info(f"No corr stats data at found at "
                            f"{corr_stats_loc}")
        except ValueError as ve:
            # Raised when s3 location is not set
            logger.warning(ve)

        # If not found on s3 or ValueError was raised
        if not self.corr_stats_axb:
            logger.info("Generating corr stats data")
            # Load correlation matrix: default path from script settings
            # when z_corr is None, or an explicit local file when a
            # string was passed.
            if z_corr is None:
                z_corr = self.load_z_corr()
            if isinstance(z_corr, str):
                z_corr = self.load_z_corr(local_file_path=z_corr)
            # Load reactome if present; absence is not an error.
            try:
                reactome = self.load_reactome()
            except FileNotFoundError:
                logger.info("No reactome file used in script")
                reactome = None
            self.corr_stats_axb: Results = axb_stats(
                self.expl_df,
                self.stats_df,
                z_corr=z_corr,
                reactome=reactome,
                eval_str=False,
                max_proc=max_proc,
                max_corr_pairs=max_so_pairs_size,
                do_mp_pairs=mp_pairs,
                run_linear=run_linear,
            )
            # Best-effort cache of the freshly computed results back to
            # s3; a missing s3 location only logs a warning.
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                logger.info(f"Uploading corr stats to S3 at "
                            f"{corr_stats_loc}")
                s3p_loc = S3Path.from_string(corr_stats_loc)
                s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                logger.info("Finished uploading corr stats to S3")
            except ValueError:
                logger.warning("Unable to upload corr stats to S3")
    else:
        logger.info("Data already present in corr_stats_axb")
    return self.corr_stats_axb