def _upload_bytes_io_to_s3(bytes_io_obj: BytesIO, s3p: S3Path):
    """Upload a BytesIO object to s3

    Parameters
    ----------
    bytes_io_obj : BytesIO
        Object to upload
    s3p : S3Path
        An S3Path instance of the full upload url
    """
    logger.info(f"Uploading BytesIO object to s3: {str(s3p)}")
    bytes_io_obj.seek(0)  # Just in case
    s3 = get_s3_client(unsigned=False)
    s3p.put(body=bytes_io_obj, s3=s3)
def _joinpath(fpath: Union[S3Path, Path], other: str) -> Union[S3Path, Path]:
    if isinstance(fpath, Path):
        return fpath.joinpath(other).absolute()
    else:
        if (fpath.to_string().endswith("/") and not other.startswith("/")
                or not fpath.to_string().endswith("/")
                and other.startswith("/")):
            return S3Path.from_string(fpath.to_string() + other)
        elif fpath.to_string().endswith("/") and other.startswith("/"):
            return S3Path.from_string(fpath.to_string() + other[1:])
        elif not fpath.to_string().endswith("/") and not other.startswith("/"):
            return S3Path.from_string(fpath.to_string() + "/" + other)
        else:
            raise ValueError(f"Unable to join {fpath.to_string()} and "
                             f'{other} with "/"')
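# Usage sketch (not part of the original module): _joinpath inserts or
# de-duplicates the "/" separator for both S3 and local paths. The bucket
# and file names below are made-up placeholders.
def _example_joinpath_usage():
    s3_dir = S3Path.from_string("s3://example-bucket/results/")
    local_dir = Path("/tmp/results")
    # Both return "<dir>/plot.pdf" regardless of which side carries the "/".
    print(_joinpath(s3_dir, "plot.pdf"))
    print(_joinpath(local_dir, "plot.pdf"))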
def pg_restore(self, dump_file, **options):
    """Load content into the database from a dump file on s3."""
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif dump_file is not None and not isinstance(dump_file, S3Path):
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import run
    from os import environ

    self.session.close()
    self.grab_session()

    # Add the password to the env
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Pipe the database dump from s3 through this machine into the database
    logger.info("Dumping into the database.")
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    run(' '.join(['aws', 's3', 'cp', dump_file.to_string(), '-', '|',
                  'pg_restore', *self._form_pg_args(), *option_list,
                  '--no-owner']),
        env=my_env, shell=True, check=True)
    self.session.close()
    self.grab_session()
    return dump_file
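# Usage sketch (illustrative, not from the original source): keyword options
# are turned into pg_restore flags, e.g. schema='readonly' becomes
# --schema=readonly and a boolean True such as clean=True becomes the bare
# flag --clean. The s3 url below is a made-up placeholder.
def _example_pg_restore_usage(db):
    return db.pg_restore("s3://example-bucket/dumps/readonly.dump",
                         schema='readonly', clean=True)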
def _get_preassembler():
    s3 = boto3.client('s3')
    test_ontology_path = S3Path(
        bucket='bigmech',
        key='travis/bio_ontology/1.4/mock_ontology.pkl'
    )
    test_ontology = pickle.loads(test_ontology_path.get(s3)['Body'].read())
    print("Loaded test ontology.")
    return DbPreassembler(yes_all=True, ontology=test_ontology)
def _main():
    parser = _make_parser()
    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)

    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()
    s3_cache = S3Path.from_string(args.cache)
    pa = DbPreassembler(args.batch, s3_cache,
                        stmt_type=args.stmt_type,
                        yes_all=args.yes_all)

    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))
    if args.task == 'create':
        pa.create_corpus(db, args.continuing)
    elif args.task == 'update':
        pa.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
def list_dumps():
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    return [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
            for d in res['CommonPrefixes']]
def get_s3_path(self) -> S3Path:
    """Return an S3Path object of the saved s3 location

    Returns
    -------
    S3Path
    """
    if self.s3_location is None:
        raise ValueError("s3_location is not set")
    return S3Path.from_string(self.s3_location)
def list_dumps(started=None, ended=None):
    """List all dumps, optionally filtered by their status.

    Parameters
    ----------
    started : Optional[bool]
        If True, find dumps that have started. If False, find dumps that
        have NOT been started. If None, do not filter by start status.
    ended : Optional[bool]
        The same as `started`, but checking whether the dump is ended
        or not.

    Returns
    -------
    list of S3Path objects
        Each S3Path object contains the bucket and key prefix information
        for a set of dump files, e.g.

            [S3Path(bigmech, indra-db/dumps/2020-07-16/),
             S3Path(bigmech, indra-db/dumps/2020-08-28/),
             S3Path(bigmech, indra-db/dumps/2020-09-18/),
             S3Path(bigmech, indra-db/dumps/2020-11-12/),
             S3Path(bigmech, indra-db/dumps/2020-11-13/)]
    """
    # Get all the dump "directories".
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    if res['KeyCount'] == 0:
        return []
    dumps = [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
             for d in res['CommonPrefixes']]

    # Filter to those that have "started"
    if started is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(Start.file_name()).exists(s3)
                 == started]

    # Filter to those that have "ended"
    if ended is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(End.file_name()).exists(s3) == ended]

    return dumps
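# Usage sketch (illustrative, not from the original source): combine the
# `started` and `ended` filters to find dump runs that are still in progress.
def _example_list_unfinished_dumps():
    in_progress = list_dumps(started=True, ended=False)
    for dump_prefix in in_progress:
        print(dump_prefix)
    return in_progress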
def pg_dump(self, dump_file, **options):
    """Use the pg_dump command to dump part of the database onto s3.

    The `pg_dump` tool must be installed, and must be a version compatible
    with the database(s) being used.

    All keyword arguments are converted into flags/arguments of pg_dump. For
    documentation run `pg_dump --help`. This will also confirm you have
    `pg_dump` installed.

    By default, the "General" and "Connection" options are already set. The
    most likely specification you will want to use is `--table` or
    `--schema`, specifying either a particular table or schema to dump.

    Parameters
    ----------
    dump_file : S3Path or str
        The location on s3 where the content should be dumped.
    """
    if isinstance(dump_file, str):
        dump_file = S3Path.from_string(dump_file)
    elif dump_file is not None and not isinstance(dump_file, S3Path):
        raise ValueError("Argument `dump_file` must be appropriately "
                         "formatted string or S3Path object, not %s."
                         % type(dump_file))

    from subprocess import check_call
    from os import environ

    # Make sure the session is fresh and any previous sessions are done.
    self.session.close()
    self.grab_session()

    # Add the password to the env
    my_env = environ.copy()
    my_env['PGPASSWORD'] = self.url.password

    # Dump the database onto s3, piping through this machine (errors if
    # anything went wrong).
    option_list = [f'--{opt}' if isinstance(val, bool) and val
                   else f'--{opt}={val}'
                   for opt, val in options.items()]
    cmd = ' '.join(["pg_dump", *self._form_pg_args(), *option_list,
                    '-Fc', '|',
                    'aws', 's3', 'cp', '-', dump_file.to_string()])
    check_call(cmd, shell=True, env=my_env)
    return dump_file
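# Usage sketch (illustrative, not from the original source): dump a single
# schema to s3. The keyword argument schema='readonly' is converted into the
# pg_dump flag --schema=readonly; the destination url is a made-up
# placeholder.
def _example_pg_dump_usage(db):
    return db.pg_dump("s3://example-bucket/dumps/2021-01-01/readonly.dump",
                      schema='readonly')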
def get_latest_dump_file():
    import boto3
    from indra.util.aws import iter_s3_keys
    from indra_db.config import get_s3_dump

    s3 = boto3.client('s3')
    s3_path = get_s3_dump()

    logger.debug("Looking for the latest dump file on s3 to %s." % s3_path)

    # Get the most recent file from s3.
    max_date_str = None
    max_lm_date = None
    latest_key = None
    for key, lm_date in iter_s3_keys(s3, with_dt=True, **s3_path.kw()):
        # Get the date string from the name, ignoring non-standard files.
        suffix = key.split('/')[-1]
        m = re.match(r'readonly-(\S+)\.dump', suffix)
        if m is None:
            logger.debug("{key} is not a standard key, will not be "
                         "considered.".format(key=key))
            continue
        date_str, = m.groups()

        # Compare to the current maxes. If the date_str and the last-modified
        # date don't agree, raise an error.
        if not max_lm_date \
                or date_str > max_date_str and lm_date > max_lm_date:
            max_date_str = date_str
            max_lm_date = lm_date
            latest_key = key
        elif max_lm_date \
                and (date_str > max_date_str or lm_date > max_lm_date):
            raise S3DumpTimeAmbiguityError(key, date_str > max_date_str,
                                           lm_date > max_lm_date)

    logger.debug("Latest dump file from %s was found to be %s."
                 % (s3_path, latest_key))

    return S3Path(s3_path.bucket, latest_key)
def _get_file_pairs_from_group(s3, group: S3Path):
    files = group.list_objects(s3)
    file_pairs = defaultdict(dict)
    got_all = True
    for file_path in files:
        # Get information from the filename, including the cases with and
        # without the id_src label.
        parts = file_path.key.split('_')
        if len(parts) == 2:
            run_id, file_suffix = parts
            id_src = None
        elif len(parts) == 3:
            run_id, id_src, file_suffix = parts
        else:
            raise XDDFileError(f"XDD file does not match known standards: "
                               f"{file_path.key}")
        file_type = file_suffix.split('.')[0]

        # Try getting the file
        try:
            file_obj = s3.get_object(**file_path.kw())
            file_json = json.loads(file_obj['Body'].read())
            file_pairs[(run_id, id_src)][file_type] = file_json
        except Exception as e:
            logger.error(f"Failed to load {file_path}")
            logger.exception(e)
            # Drop any partially loaded files for this run; the dict is keyed
            # by the (run_id, id_src) tuple.
            if (run_id, id_src) in file_pairs:
                del file_pairs[(run_id, id_src)]
            got_all = False

    # Create a dict of tuples from the pairs of files.
    ret = {}
    for batch_id, files in file_pairs.items():
        if len(files) != 2 or 'bib' not in files or 'stmts' not in files:
            logger.warning(f"Run {batch_id} does not have both 'bib' and "
                           f"'stmts' in files: {files.keys()}. Skipping.")
            got_all = False
            continue
        ret[batch_id] = (files['bib'], files['stmts'])
    return ret, got_all
def plot_interesting(
    self,
    outdir: str,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    show_plot: Optional[bool] = False,
    max_proc: Optional[int] = None,
    index_counter: Optional[Union[Iterator, Generator]] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
    log_scale_y: bool = False,
):
    """Plots the same type of plot as plot_dists, but filters A, B

    A, B are filtered to those that fulfill the following:
        - No a-b or b-a explanations
        - Not explained by apriori explanations
        - Without common reactome pathways
        - With a-x-b, b-x-a or shared target explanation

    Parameters
    ----------
    outdir : str
        The output directory to save the plots in. If string starts with
        's3://' upload to s3. outdir must then have the form
        's3://<bucket>/<sub_dir>' where <bucket> must be specified and
        <sub_dir> is optional and may contain subdirectories.
    z_corr : Union[str, pd.DataFrame]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    show_plot : bool
        If True also show plots
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    index_counter : Union[Iterator, Generator]
        An object which produces a new int by using 'next()' on it. The
        integers are used to separate the figures so as to not append new
        plots in the same figure.
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10000.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.
    log_scale_y : bool
        If True, plot the plots in this method with log10 scale on y-axis.
        Default: False.
    """
    # Local file or s3
    if outdir.startswith("s3://"):
        s3_path = S3Path.from_string(outdir)
        od = None
    else:
        s3_path = None
        od = Path(outdir)
        if not od.is_dir():
            od.mkdir(parents=True, exist_ok=True)

    # Get corr stats
    corr_stats: Results = self.get_corr_stats_axb(
        z_corr=z_corr,
        max_proc=max_proc,
        max_so_pairs_size=max_so_pairs_size,
        mp_pairs=mp_pairs,
        run_linear=run_linear,
    )
    fig_index = (next(index_counter) if index_counter
                 else floor(datetime.timestamp(datetime.utcnow())))
    plt.figure(fig_index)
    plt.hist(
        corr_stats.azfb_avg_corrs,
        bins="auto",
        density=True,
        color="b",
        alpha=0.3,
        log=log_scale_y,
    )
    plt.hist(
        corr_stats.avg_x_filtered_corrs,
        bins="auto",
        density=True,
        color="r",
        alpha=0.3,
        log=log_scale_y,
    )
    legend = [
        "Filtered A-X-B for any X",
        "Filtered A-X-B for X in network"
    ]
    sd_str = self.get_sd_str()
    title = (f"avg X corrs, filtered {sd_str} "
             f'({self.script_settings["graph_type"]})')
    plt.title(title)
    plt.ylabel("Norm. Density")
    plt.xlabel("mean(abs(corr(a,x)), abs(corr(x,b))) (SD)")
    plt.legend(legend)
    name = "%s_%s_axb_filtered_hist_comparison.pdf" % (
        sd_str,
        self.script_settings["graph_type"],
    )

    # Save to file or ByteIO and S3
    if od is None:
        fname = BytesIO()
    else:
        fname = od.joinpath(name).as_posix()
    plt.savefig(fname, format="pdf")
    if od is None:
        # Reset pointer
        fname.seek(0)

        # Upload to s3
        full_s3_path = _joinpath(s3_path, name)
        _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path)

    # Show plot
    if show_plot:
        plt.show()

    # Close figure
    plt.close(fig_index)
def plot_corr_stats(
    self,
    outdir: str,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    show_plot: bool = False,
    max_proc: Optional[int] = None,
    index_counter: Optional[Union[Iterator, Generator]] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
    log_scale_y: bool = False,
):
    """Plot the results of running explainer.get_corr_stats_axb()

    Parameters
    ----------
    outdir : str
        The output directory to save the plots in. If string starts with
        's3://' upload to s3. outdir must then have the form
        's3://<bucket>/<sub_dir>' where <bucket> must be specified and
        <sub_dir> is optional and may contain subdirectories.
    z_corr : Union[str, pd.DataFrame]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    show_plot : bool
        If True, also show plots after saving them. Default False.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    index_counter : Union[Iterator, Generator]
        An object which produces a new int by using 'next()' on it. The
        integers are used to separate the figures so as to not append new
        plots in the same figure.
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.
    log_scale_y : bool
        If True, plot the plots in this method with log10 scale on y-axis.
        Default: False.
    """
    # Local file or s3
    if outdir.startswith("s3://"):
        s3_path = S3Path.from_string(outdir)
        logger.info(f"Outdir path is on S3: {str(s3_path)}")
        od = None
    else:
        s3_path = None
        od = Path(outdir)
        if not od.is_dir():
            logger.info(f"Creating directory/ies for {od}")
            od.mkdir(parents=True, exist_ok=True)

    # Get corr stats
    corr_stats: Results = self.get_corr_stats_axb(
        z_corr=z_corr,
        max_proc=max_proc,
        max_so_pairs_size=max_so_pairs_size,
        mp_pairs=mp_pairs,
        run_linear=run_linear,
    )
    sd_str = self.get_sd_str()
    for m, (plot_type, data) in enumerate(corr_stats.dict().items()):
        if len(data) > 0:
            name = f'{plot_type}_{self.script_settings["graph_type"]}.pdf'
            logger.info(f"Using file name {name}")
            if od is None:
                fname = BytesIO()
            else:
                fname = od.joinpath(name).as_posix()
            if isinstance(data[0], tuple):
                data = [t[-1] for t in data]

            fig_index = next(index_counter) if index_counter else m
            plt.figure(fig_index)
            plt.hist(x=data, bins="auto", log=log_scale_y)
            title = (f'{plot_type.replace("_", " ").capitalize()}; '
                     f'{sd_str} {self.script_settings["graph_type"]}')
            plt.title(title)
            plt.xlabel("combined z-score")
            plt.ylabel("count")

            # Save to file or ByteIO and S3
            plt.savefig(fname, format="pdf")
            if od is None:
                # Reset pointer
                fname.seek(0)

                # Upload to s3
                full_s3_path = _joinpath(s3_path, name)
                _upload_bytes_io_to_s3(bytes_io_obj=fname,
                                       s3p=full_s3_path)

            # Show plot
            if show_plot:
                plt.show()

            # Close figure
            plt.close(fig_index)
        else:
            logger.warning(f"Empty result for {plot_type} in "
                           f"range {sd_str} for graph type "
                           f'{self.script_settings["graph_type"]}')
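# Usage sketch (illustrative, not from the original source): write the
# correlation-statistics histograms either to a local directory or straight
# to s3, depending on the outdir prefix. The bucket name is a made-up
# placeholder.
def _example_plot_corr_stats_usage(explainer):
    explainer.plot_corr_stats(outdir="s3://example-bucket/corr_plots/",
                              run_linear=True, log_scale_y=True)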
def get_corr_stats_axb(
    self,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    max_proc: Optional[int] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
) -> Results:
    """Get statistics of the correlations from different explanation types

    Note: the provided options have no effect if the data is loaded
    from cache.

    Parameters
    ----------
    z_corr : Optional[Union[pd.DataFrame, str]]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object. If not provided, an attempt will be
        made to load it from the file path present in script_settings.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000. If the number of pairs
        to check is smaller than 10 000, no sampling is done.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.

    Returns
    -------
    Results
        A BaseModel containing correlation data for different explanations
    """
    if not self.corr_stats_axb:
        s3 = get_s3_client(unsigned=False)
        try:
            corr_stats_loc = self.get_s3_corr_stats_path()
            if S3Path.from_string(corr_stats_loc).exists(s3):
                logger.info(f"Found corr stats data at {corr_stats_loc}")
                corr_stats_json = file_opener(corr_stats_loc)
                self.corr_stats_axb = Results(**corr_stats_json)
            else:
                logger.info(f"No corr stats data found at "
                            f"{corr_stats_loc}")
        except ValueError as ve:
            # Raised when s3 location is not set
            logger.warning(ve)

        # If not found on s3 or ValueError was raised
        if not self.corr_stats_axb:
            logger.info("Generating corr stats data")
            # Load correlation matrix
            if z_corr is None:
                z_corr = self.load_z_corr()
            if isinstance(z_corr, str):
                z_corr = self.load_z_corr(local_file_path=z_corr)
            # Load reactome if present
            try:
                reactome = self.load_reactome()
            except FileNotFoundError:
                logger.info("No reactome file used in script")
                reactome = None
            self.corr_stats_axb: Results = axb_stats(
                self.expl_df,
                self.stats_df,
                z_corr=z_corr,
                reactome=reactome,
                eval_str=False,
                max_proc=max_proc,
                max_corr_pairs=max_so_pairs_size,
                do_mp_pairs=mp_pairs,
                run_linear=run_linear,
            )
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                logger.info(f"Uploading corr stats to S3 at "
                            f"{corr_stats_loc}")
                s3p_loc = S3Path.from_string(corr_stats_loc)
                s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                logger.info("Finished uploading corr stats to S3")
            except ValueError:
                logger.warning("Unable to upload corr stats to S3")
    else:
        logger.info("Data already present in corr_stats_axb")
    return self.corr_stats_axb
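# Usage sketch (illustrative, not from the original source): the first call
# computes the statistics (or downloads a cached copy from s3) and stores
# them on the instance; subsequent calls return the cached Results, so the
# keyword options only take effect on the first call.
def _example_get_corr_stats_usage(explainer):
    results = explainer.get_corr_stats_axb(max_so_pairs_size=5000,
                                           run_linear=True)
    return results.dict()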
                                  sup_links=list(sg.edges))
            beliefs.update(calculate_belief(stmts))
            group = set()
        return beliefs
    else:
        stmts = load_mock_statements(db)
        return calculate_belief(stmts)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DB Belief Score Dumper')
    parser.add_argument('--fname', nargs='?', type=str,
                        default='belief_dict.pkl',
                        help='Filename of the belief dict output')
    parser.add_argument('-s3', action='store_true', default=False,
                        help='Upload belief dict to the bigmech s3 bucket '
                             'instead of saving it locally')
    args = parser.parse_args()
    belief_dict = get_belief()
    if args.s3:
        key = '/'.join([datetime.utcnow().strftime('%Y-%m-%d'), args.fname])
        s3_path = S3Path(S3_SUBDIR, key)
        upload_pickle_to_s3(obj=belief_dict, s3_path=s3_path)
    else:
        with open(args.fname, 'wb') as f:
            pickle.dump(belief_dict, f)
class XddManager:
    bucket = S3Path(bucket='hms-uw-collaboration')
    reader_versions = {'REACH': '1.3.3-61059a-biores-e9ee36',
                       'SPARSER': 'February2020-linux'}
    indra_version = '1.16.0-c439fdbc936f4eac00cafd559927d7ee06c492e8'

    def __init__(self):
        self.groups = None
        self.statements = None
        self.text_content = None

    def load_groups(self, db):
        logger.info("Finding groups that have not been handled yet.")
        s3 = boto3.client('s3')
        groups = self.bucket.list_prefixes(s3)
        previous_groups = {s for s, in db.select_all(db.XddUpdates.day_str)}

        self.groups = [group for group in groups
                       if group.key[:-1] not in previous_groups]
        return

    def load_statements(self, db):
        logger.info("Loading statements.")
        s3 = boto3.client('s3')
        self.statements = defaultdict(lambda: defaultdict(list))
        self.text_content = {}
        for group in self.groups:
            logger.info(f"Processing {group.key}")
            file_pair_dict = _get_file_pairs_from_group(s3, group)
            for (run_id, id_src), (bibs, stmts) in file_pair_dict.items():
                logger.info(f"Loading {run_id}")
                doi_lookup = {bib['_xddid']: bib['identifier'][0]['id'].upper()
                              for bib in bibs if 'identifier' in bib}
                pub_lookup = {bib['_xddid']: bib['publisher'] for bib in bibs}
                dois = {doi for doi in doi_lookup.values()}
                trids = _get_trids_from_dois(db, dois)

                for sj in stmts:
                    ev = sj['evidence'][0]
                    xddid = ev['text_refs']['CONTENT_ID']
                    ev.pop('pmid', None)
                    if xddid not in doi_lookup:
                        logger.warning("Skipping statement because bib "
                                       "lacked a DOI.")
                        continue
                    ev['text_refs']['DOI'] = doi_lookup[xddid]

                    trid = trids[doi_lookup[xddid]]
                    ev['text_refs']['TRID'] = trid
                    ev['text_refs']['XDD_RUN_ID'] = run_id
                    ev['text_refs']['XDD_GROUP_ID'] = group.key

                    self.statements[trid][ev['text_refs']['READER']].append(sj)
                    if trid not in self.text_content:
                        if id_src:
                            src = f'xdd-{id_src}'
                        else:
                            src = 'xdd'
                        self.text_content[trid] = \
                            (trid, src, 'xdd', 'fulltext',
                             pub_lookup[xddid] == 'bioRxiv')
        return

    def dump_statements(self, db):
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.id],
            db.TextContent.text_ref_id.in_(self.statements.keys()),
            db.TextContent.format == 'xdd'
        )
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt,
                        reading_id,
                        indra_version=self.indra_version
                    )
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading', r_rows, r_cols, commit=False)

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        db.copy_lazy('raw_statements', s_rows,
                     DatabaseStatementData.get_cols(), commit=False)
        if len(stmts):
            insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                              commit=False)

        update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                        group.key[:-1])
                       for group in self.groups]
        db.copy('xdd_updates', update_rows,
                ('reader_versions', 'indra_version', 'day_str'))
        return

    def run(self, db):
        self.load_groups(db)
        self.load_statements(db)
        self.dump_statements(db)