def get_dir_iter(path: str, file_ending: Optional[str] = None) -> List:
    """Takes a directory path and returns a list of files

    Parameters
    ----------
    path :
        The path to the directory to loop over
    file_ending :
        If provided, files in the returned list must be of this format,
        e.g. .pkl

    Returns
    -------
    :
        A list of files in the directory
    """
    if path.startswith('s3://'):
        s3 = get_s3_client(unsigned=False)
        s3_base_path = S3Path.from_string(path)
        input_iter = \
            [s3p.to_string() for s3p in s3_base_path.list_objects(s3)]
    else:
        local_base_path = Path(path)
        input_iter = [f.absolute().as_posix() for f in
                      local_base_path.glob('*') if f.is_file()]
    if file_ending:
        input_iter = [f for f in input_iter if f.endswith(file_ending)]
    return input_iter
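def _example_get_dir_iter():
    # Hedged usage sketch for get_dir_iter; the directory paths below are
    # hypothetical. The same call works for a local directory and for an s3
    # prefix, optionally filtered by file ending.
    local_pkls = get_dir_iter('/tmp/indra_dumps', file_ending='.pkl')
    s3_pkls = get_dir_iter('s3://my-bucket/indra_dumps/', file_ending='.pkl')
    return local_pkls, s3_pkls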
def dump_json_to_s3(name: str, json_obj: Dict, public: bool = False,
                    get_url: bool = False) -> Optional[str]:
    """Dumps a json object to S3

    Parameters
    ----------
    name :
        The file name to use for the uploaded file. Appropriate prefixes
        will be used.
    json_obj :
        The json object to upload
    public :
        If True, allow public read access. Default: False.
    get_url :
        If True, return the S3 url of the object

    Returns
    -------
    :
        Optionally return the S3 url of the json file
    """
    s3 = get_s3_client(unsigned=False)
    key = 'indra_network_search/' + name
    options = {'Bucket': DUMPS_BUCKET,
               'Key': key}
    if public:
        options['ACL'] = 'public-read'
    s3.put_object(Body=json.dumps(json_obj), **options)
    if get_url:
        return s3.generate_presigned_url(
            'get_object', Params={'Key': key, 'Bucket': DUMPS_BUCKET})
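def _example_dump_json():
    # Hedged usage sketch for dump_json_to_s3; the file name and payload are
    # hypothetical. With get_url=True a presigned url for the uploaded object
    # is returned, otherwise the function returns None.
    url = dump_json_to_s3('example_result.json', {'status': 'ok'},
                          public=True, get_url=True)
    return url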
def dump_sif(df_file=None, db_res_file=None, csv_file=None,
             src_count_file=None, reload=False, reconvert=True, ro=None):
    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires the s3fs package under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    s3.put_object(Body=csv_buf.getvalue(), **csv_file.kw())
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # Save locally
        else:
            type_counts.to_csv(csv_file)
    if src_count_file:
        _ = get_source_counts(src_count_file, ro=ro)
    return
def load_pickle_from_s3(s3_path):
    logger.info('Loading pickle %s.' % s3_path)
    s3 = get_s3_client(False)
    try:
        res = s3_path.get(s3)
        obj = pickle.loads(res['Body'].read())
        logger.info('Finished loading %s.' % s3_path)
        return obj
    except Exception as e:
        logger.error('Failed to load %s.' % s3_path)
        logger.exception(e)
def upload_pickle_to_s3(obj, s3_path):
    """Upload a python object as a pickle to s3"""
    logger.info('Uploading %s as pickle object to bucket %s'
                % (s3_path.key.split('/')[-1], s3_path.bucket))
    s3 = get_s3_client(unsigned=False)
    try:
        s3_path.upload(s3, pickle.dumps(obj))
        logger.info('Finished dumping file to s3')
    except Exception as e:
        logger.error('Failed to upload to s3')
        logger.exception(e)
def load_json_from_s3(s3_path):
    """Helper to load json from s3"""
    logger.info(f'Loading json {s3_path} from s3.')
    s3 = get_s3_client(False)
    try:
        res = s3_path.get(s3)
        obj = json.loads(res['Body'].read().decode())
        logger.info(f'Finished loading {s3_path}.')
        return obj
    except Exception as e:
        logger.error(f'Failed to load {s3_path}.')
        logger.exception(e)
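def _example_pickle_json_round_trip():
    """Hedged usage sketch for the pickle/json s3 helpers above.

    The bucket and keys are hypothetical; upload_pickle_to_s3,
    load_pickle_from_s3 and load_json_from_s3 all expect an S3Path instance.
    """
    pkl_path = S3Path.from_string('s3://my-bucket/example/my_dict.pkl')
    upload_pickle_to_s3({'a': 1, 'b': 2}, pkl_path)
    obj = load_pickle_from_s3(pkl_path)

    json_path = S3Path.from_string('s3://my-bucket/example/my_dict.json')
    json_obj = load_json_from_s3(json_path)
    return obj, json_obj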
def get_latest_pa_stmt_dump():
    s3_cli = get_s3_client(False)
    # Get file key
    dump_name = 'full_pa_stmts.pkl'
    file_tree = get_s3_file_tree(s3=s3_cli,
                                 bucket=DUMPS_BUCKET,
                                 prefix=DUMPS_PREFIX,
                                 with_dt=True)
    # Get all keys for dump_name
    keys = [key for key in file_tree.gets('key')
            if key[0].endswith(dump_name)]
    keys.sort(key=itemgetter(1))  # Sorts ascending by datetime
    return load_pickle_from_s3(s3_cli, keys[-1][0], DUMPS_BUCKET)
def _upload_bytes_io_to_s3(bytes_io_obj: BytesIO, s3p: S3Path):
    """Upload a BytesIO object to s3

    Parameters
    ----------
    bytes_io_obj : BytesIO
        Object to upload
    s3p : S3Path
        An S3Path instance of the full upload url
    """
    logger.info(f"Uploading BytesIO object to s3: {str(s3p)}")
    bytes_io_obj.seek(0)  # Just in case
    s3 = get_s3_client(unsigned=False)
    s3p.put(body=bytes_io_obj, s3=s3)
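def _example_upload_bytes_io():
    # Hedged usage sketch for _upload_bytes_io_to_s3; the target url is
    # hypothetical. Any BytesIO works, e.g. a pickled object held in memory.
    buf = BytesIO(pickle.dumps({'a': 1}))
    s3p = S3Path.from_string('s3://my-bucket/example/obj.pkl')
    _upload_bytes_io_to_s3(buf, s3p)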
def dump_json_to_s3(name, json_obj, public=False, get_url=False):
    """Set public=True for public read access"""
    s3 = get_s3_client(unsigned=False)
    key = 'indra_network_search/' + name
    options = {'Bucket': DUMPS_BUCKET,
               'Key': key}
    if public:
        options['ACL'] = 'public-read'
    s3.put_object(Body=json.dumps(json_obj), **options)
    if get_url:
        return s3.generate_presigned_url(
            'get_object', Params={'Key': key, 'Bucket': DUMPS_BUCKET})
def get_latest_sif_s3(get_mesh_ids: bool = False) \
        -> Union[Tuple[Any, str], Tuple[Tuple[Any, str], Tuple[Any, str]]]:
    necc_files = [mngr.name for mngr in dumpers]
    if get_mesh_ids:
        necc_files.append(StatementHashMeshId.name)
    s3 = get_s3_client(unsigned=False)
    tree = get_s3_file_tree(s3, bucket=DUMPS_BUCKET, prefix=DUMPS_PREFIX,
                            with_dt=True)
    # Find all pickles and jsons
    keys = [key for key in tree.gets('key')
            if key[0].endswith(('.pkl', '.json'))]
    # Sort newest first
    keys.sort(key=lambda t: t[1], reverse=True)
    # Get keys of those pickles
    keys_in_latest_dir = \
        [k[0] for k in keys if any(nfl in k[0] for nfl in necc_files)]
    # Map key to resource
    necc_keys = {}
    for n in necc_files:
        for k in keys_in_latest_dir:
            # check name then alt name
            if n in k:
                # Save and continue to next file in necc_files
                necc_keys[n] = k
                break
    logger.info(f'Latest files: {", ".join([f for f in necc_keys.values()])}')
    df = load_pickle_from_s3(s3, key=necc_keys[Sif.name],
                             bucket=DUMPS_BUCKET)
    sif_date = _get_date_from_s3_key(necc_keys[Sif.name])
    if get_mesh_ids:
        mid = load_pickle_from_s3(s3,
                                  key=necc_keys[StatementHashMeshId.name],
                                  bucket=DUMPS_BUCKET)
        meshids_date = _get_date_from_s3_key(
            necc_keys[StatementHashMeshId.name])
        return (df, sif_date), (mid, meshids_date)

    return df, sif_date
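def _example_latest_sif():
    # Hedged usage sketch for get_latest_sif_s3. With get_mesh_ids=True the
    # function returns two (object, date) tuples instead of one.
    sif_df, sif_date = get_latest_sif_s3()
    (sif_df, sif_date), (mesh_ids, mesh_date) = \
        get_latest_sif_s3(get_mesh_ids=True)
    return sif_df, sif_date, mesh_ids, mesh_date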
def dump_sif(src_count_file, res_pos_file, belief_file, df_file=None,
             db_res_file=None, csv_file=None, reload=True, reconvert=True,
             ro=None, normalize_names: bool = True):
    """Build and dump a sif dataframe of PA statements with grounded agents

    Parameters
    ----------
    src_count_file : Union[str, S3Path]
        A location to load the source count dict from. Can be a local file
        path, an s3 url string or an S3Path instance.
    res_pos_file : Union[str, S3Path]
        A location to load the residue-position dict from. Can be a local
        file path, an s3 url string or an S3Path instance.
    belief_file : Union[str, S3Path]
        A location to load the belief dict from. Can be a local file path,
        an s3 url string or an S3Path instance.
    df_file : Optional[Union[str, S3Path]]
        If provided, dump the sif to this location. Can be a local file
        path, an s3 url string or an S3Path instance.
    db_res_file : Optional[Union[str, S3Path]]
        If provided, save the db content to this location. Can be a local
        file path, an s3 url string or an S3Path instance.
    csv_file : Optional[Union[str, S3Path]]
        If provided, calculate dataframe statistics and save them to a
        local file or s3. Can be a local file path, an s3 url string or an
        S3Path instance.
    reconvert : bool
        Whether to generate a new DataFrame from the database content or
        to load and return a DataFrame from `df_file`. If False, `df_file`
        must be given. Default: True.
    reload : bool
        If True, load new content from the database and make a new
        dataframe. If False, content can be loaded from provided files.
        Default: True.
    ro : Optional[PrincipalDatabaseManager]
        Provide a DatabaseManager to load database content from. If not
        provided, `get_db('primary')` will be used.
    normalize_names : bool
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: True.
    """
    def _load_file(path):
        if isinstance(path, str) and path.startswith('s3:') or \
                isinstance(path, S3Path):
            if isinstance(path, str):
                s3path = S3Path.from_string(path)
            else:
                s3path = path
            if s3path.to_string().endswith('pkl'):
                return load_pickle_from_s3(s3path)
            elif s3path.to_string().endswith('json'):
                return load_json_from_s3(s3path)
            else:
                raise ValueError(f'Unknown file format of {path}')
        else:
            if path.endswith('pkl'):
                with open(path, 'rb') as f:
                    return pickle.load(f)
            elif path.endswith('json'):
                with open(path, 'r') as f:
                    return json.load(f)

    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Load supporting files
    res_pos = _load_file(res_pos_file)
    src_count = _load_file(src_count_file)
    belief = _load_file(belief_file)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content, src_count_dict=src_count,
                        res_pos_dict=res_pos, belief_dict=belief,
                        normalize_names=normalize_names)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires the s3fs package under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    csv_file.upload(s3, csv_buf)
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # Save locally
        else:
            type_counts.to_csv(csv_file)
    return
def get_corr_stats_axb(
    self,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    max_proc: Optional[int] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
) -> Results:
    """Get statistics of the correlations from different explanation types

    Note: the provided options have no effect if the data is loaded
    from cache.

    Parameters
    ----------
    z_corr : Optional[Union[pd.DataFrame, str]]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing
        in get_corr_stats_mp. Default: multiprocessing.cpu_count()
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000. If the number of pairs
        to check is smaller than 10 000, no sampling is done.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does
        not support multiprocessing. Default: False.

    Returns
    -------
    Results
        A BaseModel containing correlation data for different explanations
    """
    if not self.corr_stats_axb:
        s3 = get_s3_client(unsigned=False)
        try:
            corr_stats_loc = self.get_s3_corr_stats_path()
            if S3Path.from_string(corr_stats_loc).exists(s3):
                logger.info(f"Found corr stats data at {corr_stats_loc}")
                corr_stats_json = file_opener(corr_stats_loc)
                self.corr_stats_axb = Results(**corr_stats_json)
            else:
                logger.info(f"No corr stats data found at "
                            f"{corr_stats_loc}")
        except ValueError as ve:
            # Raised when s3 location is not set
            logger.warning(ve)

        # If not found on s3 or ValueError was raised
        if not self.corr_stats_axb:
            logger.info("Generating corr stats data")
            # Load correlation matrix
            if z_corr is None:
                z_corr = self.load_z_corr()
            if isinstance(z_corr, str):
                z_corr = self.load_z_corr(local_file_path=z_corr)
            # Load reactome if present
            try:
                reactome = self.load_reactome()
            except FileNotFoundError:
                logger.info("No reactome file used in script")
                reactome = None
            self.corr_stats_axb: Results = axb_stats(
                self.expl_df,
                self.stats_df,
                z_corr=z_corr,
                reactome=reactome,
                eval_str=False,
                max_proc=max_proc,
                max_corr_pairs=max_so_pairs_size,
                do_mp_pairs=mp_pairs,
                run_linear=run_linear,
            )
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                logger.info(f"Uploading corr stats to S3 at "
                            f"{corr_stats_loc}")
                s3p_loc = S3Path.from_string(corr_stats_loc)
                s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                logger.info("Finished uploading corr stats to S3")
            except ValueError:
                logger.warning("Unable to upload corr stats to S3")
    else:
        logger.info("Data already present in corr_stats_axb")
    return self.corr_stats_axb
def get_logs_from_s3(folder=None, cached=True, past_days=None):
    """Download logs from S3 and save into a local folder

    Parameters
    ----------
    folder : str
        The directory where to put the processed reports
    cached : bool
        Look for already existing folders and skip those that exist
    past_days : int|datetime.datetime
        Either an integer or an instance of a datetime.datetime object
        specifying the number of days into the past to download logs for.
        If nothing is specified (default), all logs are downloaded.
        Default: None.

    Returns
    -------
    dir_set : set
        A set containing the dir paths of all the requested logs
    """
    s3 = get_s3_client(unsigned=False)
    if past_days:
        days_ago = past_days if isinstance(past_days, datetime) else \
            ((datetime.utcnow() - timedelta(days=past_days)).replace(
                tzinfo=timezone.utc) if isinstance(past_days, int) else None)
    else:
        days_ago = None
    tree = get_s3_file_tree(s3, 'cwc-hms', 'bob_ec2_logs', days_ago)
    keys = tree.gets('key')
    # Here we only get the tar.gz files which contain the logs for the
    # facilitator + the json file (if present) of the user data
    logger.info('Total number of objects: %d' % len(keys))
    logger.info('Total number of images found: %d' %
                len([k for k in keys if 'image' in k]))
    keys = [key for key in keys if key.startswith('bob_ec2_logs/') and
            key.endswith(('.tar.gz', '.json', '.log'))]
    logger.info('Number of archives: %d' % len(keys))
    fname_patt = re.compile(
        r'([\w:-]+?)_(\w+?)_(\w+?_\w+?)_(.*)\.(tar\.gz|json|log)'
    )
    dir_set = set()
    for key in tqdm.tqdm(keys):
        fname = os.path.basename(key)
        m = fname_patt.match(fname)
        if m is None:
            logger.warning("File name %s failed to match %s. Skipping..."
                           % (fname, fname_patt))
            continue
        image_id, cont_hash, cont_name, resource_name, suffix = m.groups()
        head_dir_path = '%s_%s_%s' % (image_id.replace(':', '-'),
                                      cont_name, cont_hash)
        dir_set.add(head_dir_path)
        if folder:
            head_dir_path = os.path.join(folder, head_dir_path)
        if not os.path.exists(head_dir_path):
            os.makedirs(head_dir_path, exist_ok=True)
        if resource_name == 'bioagent_images':
            outpath = head_dir_path
        else:
            outpath = os.path.join(head_dir_path, 'log.txt')
        if cached and os.path.exists(outpath) and \
                not key.endswith(('.json', '.log')):
            continue
        tgz_file_name = key.split('/')[-1]
        tgz_file = os.path.join(head_dir_path, tgz_file_name)
        res = s3.get_object(Bucket='cwc-hms', Key=key)
        # byte_stream = BytesIO(res['Body'].read())
        byte_stream = res['Body'].read()
        with open(tgz_file, 'wb') as tf:
            tf.write(byte_stream)
        # Re-open file
        if tgz_file.endswith(('.json', '.log')):
            continue
        with open(tgz_file, 'rb') as file_byte_stream:
            with tarfile.open(None, 'r', fileobj=file_byte_stream) as tarf:
                if resource_name == 'bioagent_images':
                    tarf.extractall(outpath)
                else:
                    outpaths = tarf.getnames()
                    facls = [n for n in outpaths if
                             n.endswith('facilitator.log')]
                    if not facls:
                        logger.info('No facilitator.log found for %s' % key)
                        continue
                    facl = facls[0]
                    efo = tarf.extractfile(facl)
                    log_txt = efo.read().decode('utf-8')
                    with open(outpath, 'w') as fh:
                        fh.write(log_txt)
    return dir_set
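def _example_get_logs():
    # Hedged usage sketch for get_logs_from_s3; the local folder name is
    # hypothetical. This downloads only logs from the last week and skips
    # archives that were already processed.
    return get_logs_from_s3(folder='bob_logs', cached=True, past_days=7)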
def dump_pickle_to_s3(name, pyobj, prefix=''):
    s3 = get_s3_client(unsigned=False)
    key = prefix + name
    key = key.replace('//', '/')
    s3.put_object(Bucket=NET_BUCKET, Key=key,
                  Body=pickle.dumps(obj=pyobj))
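def _example_dump_pickle():
    # Hedged usage sketch for dump_pickle_to_s3; the prefix and file name are
    # hypothetical. The '//' replacement in the function makes a trailing '/'
    # in the prefix safe.
    dump_pickle_to_s3('my_graph.pkl', {'nodes': [], 'edges': []},
                      prefix='indra_db_files/')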