Example #1
def get_dir_iter(path: str, file_ending: Optional[str] = None) -> List:
    """Takes a directory path and returns a list of files

    Parameters
    ----------
    path :
        The path to the directory to loop over
    file_ending :
        If provided, files in the returned list must be of this format,
        e.g. .pkl

    Returns
    -------
    :
        A list of files in the directory
    """
    if path.startswith('s3://'):
        s3 = get_s3_client(unsigned=False)
        s3_base_path = S3Path.from_string(path)
        input_iter = \
            [s3p.to_string() for s3p in s3_base_path.list_objects(s3)]
    else:
        local_base_path = Path(path)
        input_iter = [
            f.absolute().as_posix() for f in local_base_path.glob('*')
            if f.is_file()
        ]

    if file_ending:
        input_iter = [f for f in input_iter if f.endswith(file_ending)]

    return input_iter
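
A minimal usage sketch, assuming get_s3_client and S3Path are importable as in the snippet above; the local directory and s3 url are placeholders.

# Hypothetical paths, for illustration only
local_pickles = get_dir_iter('/tmp/my_dumps', file_ending='.pkl')
s3_jsons = get_dir_iter('s3://my-bucket/some/prefix/', file_ending='.json')
print(f'{len(local_pickles)} local pickles, {len(s3_jsons)} s3 json files')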
Example #2
def dump_json_to_s3(name: str,
                    json_obj: Dict,
                    public: bool = False,
                    get_url: bool = False) -> Optional[str]:
    """Dumps a json object to S3

    Parameters
    ----------
    name :
        The file name to use for the uploaded file. Appropriate prefixes
        will be used.
    json_obj :
        The json object to upload
    public :
        If True allow public read access. Default: False.
    get_url :
        If True return the S3 url of the object

    Returns
    -------
    :
        Optionally return the S3 url of the json file
    """
    s3 = get_s3_client(unsigned=False)
    key = 'indra_network_search/' + name
    options = {'Bucket': DUMPS_BUCKET, 'Key': key}
    if public:
        options['ACL'] = 'public-read'
    s3.put_object(Body=json.dumps(json_obj), **options)
    if get_url:
        return s3.generate_presigned_url('get_object',
                                         Params={
                                             'Key': key,
                                             'Bucket': DUMPS_BUCKET
                                         })
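
A short usage sketch, assuming DUMPS_BUCKET is configured in the module; the file name and payload below are placeholders.

# Upload a small result dict with public read access and get a presigned url back
url = dump_json_to_s3(name='example_results.json',          # placeholder name
                      json_obj={'status': 'ok', 'count': 42},
                      public=True,
                      get_url=True)
print(url)  # presigned 'get_object' url; None when get_url=False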
Example #3
def dump_sif(df_file=None,
             db_res_file=None,
             csv_file=None,
             src_count_file=None,
             reload=False,
             reconvert=True,
             ro=None):
    """Build a sif dataframe from db content and optionally dump it"""
    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload,
                                 ns_list=NS_LIST,
                                 pkl_filename=db_res_file,
                                 ro=ro)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file,
                        reconvert=reconvert,
                        db_content=db_content)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type', 'evidence_count'
        ])
        type_counts = filt_df.groupby(by=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type'
        ]).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    s3.put_object(Body=csv_buf.getvalue(), **csv_file.kw())
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)

    if src_count_file:
        _ = get_source_counts(src_count_file, ro=ro)
    return
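
A usage sketch for this older dump_sif signature, assuming the primary database is reachable; the s3 urls are placeholders.

# Rebuild the dataframe from a fresh db dump and write aggregated counts to s3
dump_sif(df_file='s3://my-bucket/sif/sif.pkl',
         db_res_file='s3://my-bucket/sif/db_content.pkl',
         csv_file='s3://my-bucket/sif/type_counts.csv',
         reload=True,
         reconvert=True)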
Example #4
def load_pickle_from_s3(s3_path):
    """Helper to load a pickled object from s3"""
    logger.info('Loading pickle %s.' % s3_path)
    s3 = get_s3_client(False)
    try:
        res = s3_path.get(s3)
        obj = pickle.loads(res['Body'].read())
        logger.info('Finished loading %s.' % s3_path)
        return obj
    except Exception as e:
        logger.error('Failed to load %s.' % s3_path)
        logger.exception(e)
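
A usage sketch; the bucket and key are placeholders, and S3Path.from_string is assumed to be available as in the examples above.

# Build an S3Path from a placeholder url and load the pickled object
pkl_path = S3Path.from_string('s3://my-bucket/dumps/some_object.pkl')
obj = load_pickle_from_s3(pkl_path)  # returns None if the download fails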
Example #5
def upload_pickle_to_s3(obj, s3_path):
    """Upload a python object as a pickle to s3"""
    logger.info('Uploading %s as pickle object to bucket %s'
                % (s3_path.key.split('/')[-1], s3_path.bucket))
    s3 = get_s3_client(unsigned=False)
    try:
        s3_path.upload(s3, pickle.dumps(obj))
        logger.info('Finished dumping file to s3')
    except Exception as e:
        logger.error('Failed to upload to s3')
        logger.exception(e)
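
A round-trip sketch combining this helper with load_pickle_from_s3 from the previous example; the url is a placeholder.

# Upload a dict as a pickle and read it back (placeholder url)
target = S3Path.from_string('s3://my-bucket/dumps/settings.pkl')
upload_pickle_to_s3({'threshold': 0.5, 'labels': ['a', 'b']}, target)
roundtrip = load_pickle_from_s3(target)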
Example #6
def load_json_from_s3(s3_path):
    """Helper to load json from s3"""
    logger.info(f'Loading json {s3_path} from s3.')
    s3 = get_s3_client(False)
    try:
        res = s3_path.get(s3)
        obj = json.loads(res['Body'].read().decode())
        logger.info(f'Finished loading {s3_path}.')
        return obj
    except Exception as e:
        logger.error(f'Failed to load {s3_path}.')
        logger.exception(e)
Example #7
def get_latest_pa_stmt_dump():
    """Load the latest full PA statement pickle dump from s3"""
    s3_cli = get_s3_client(False)
    # Get file key
    dump_name = 'full_pa_stmts.pkl'
    file_tree = get_s3_file_tree(s3=s3_cli,
                                 bucket=DUMPS_BUCKET,
                                 prefix=DUMPS_PREFIX,
                                 with_dt=True)
    # Get all keys for dump_name
    keys = [key for key in file_tree.gets('key') if key[0].endswith(dump_name)]
    keys.sort(key=itemgetter(1))  # Sorts ascending by datetime

    return load_pickle_from_s3(s3_cli, keys[-1][0], DUMPS_BUCKET)
Example #8
def _upload_bytes_io_to_s3(bytes_io_obj: BytesIO, s3p: S3Path):
    """Upload a BytesIO object to s3

    Parameters
    ----------
    bytes_io_obj : BytesIO
        Object to upload
    s3p : S3Path
        An S3Path instance of the full upload url
    """
    logger.info(f"Uploading BytesIO object to s3: {str(s3p)}")
    bytes_io_obj.seek(0)  # Just in case
    s3 = get_s3_client(unsigned=False)
    s3p.put(body=bytes_io_obj, s3=s3)
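
A usage sketch; the payload and upload url are placeholders. The helper rewinds the buffer itself, so seeking before the call is optional.

from io import BytesIO  # likely already imported in the module

# Wrap some bytes in a buffer and push them to a placeholder location
buf = BytesIO(b'example payload')
_upload_bytes_io_to_s3(buf, S3Path.from_string('s3://my-bucket/uploads/payload.bin'))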
Example #9
def dump_json_to_s3(name, json_obj, public=False, get_url=False):
    """Set public=True for public read access"""
    s3 = get_s3_client(unsigned=False)
    key = 'indra_network_search/' + name
    options = {'Bucket': DUMPS_BUCKET, 'Key': key}
    if public:
        options['ACL'] = 'public-read'
    s3.put_object(Body=json.dumps(json_obj), **options)
    if get_url:
        return s3.generate_presigned_url('get_object',
                                         Params={
                                             'Key': key,
                                             'Bucket': DUMPS_BUCKET
                                         })
Example #10
def get_latest_sif_s3(get_mesh_ids: bool = False) \
        -> Union[Tuple[Any, str], Tuple[Tuple[Any, str], Tuple[Any, str]]]:
    """Load the latest sif dump, and optionally mesh ids, from s3"""
    necc_files = [mngr.name for mngr in dumpers]
    if get_mesh_ids:
        necc_files.append(StatementHashMeshId.name)
    s3 = get_s3_client(unsigned=False)
    tree = get_s3_file_tree(s3,
                            bucket=DUMPS_BUCKET,
                            prefix=DUMPS_PREFIX,
                            with_dt=True)
    # Find all pickles and jsons
    keys = [
        key for key in tree.gets('key') if key[0].endswith(('.pkl', '.json'))
    ]
    # Sort newest first
    keys.sort(key=lambda t: t[1], reverse=True)
    # Get keys of those pickles
    keys_in_latest_dir = \
        [k[0] for k in keys if any(nfl in k[0] for nfl in necc_files)]
    # Map key to resource
    necc_keys = {}
    for n in necc_files:
        for k in keys_in_latest_dir:
            # Check whether this resource name appears in the key
            if n in k:
                # Save and continue to next file in necc_files
                necc_keys[n] = k
                break
    logger.info(f'Latest files: {", ".join([f for f in necc_keys.values()])}')
    df = load_pickle_from_s3(s3, key=necc_keys[Sif.name], bucket=DUMPS_BUCKET)
    sif_date = _get_date_from_s3_key(necc_keys[Sif.name])
    if get_mesh_ids:
        mid = load_pickle_from_s3(s3,
                                  key=necc_keys[StatementHashMeshId.name],
                                  bucket=DUMPS_BUCKET)
        meshids_date = _get_date_from_s3_key(
            necc_keys[StatementHashMeshId.name])
        return (df, sif_date), (mid, meshids_date)

    return df, sif_date
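
A sketch of how the two return shapes might be consumed, assuming the Sif and StatementHashMeshId dumpers are registered as in the module above.

# Without mesh ids: a (dataframe, date) pair
sif_df, sif_date = get_latest_sif_s3()

# With mesh ids: two (object, date) pairs
(sif_df, sif_date), (mesh_ids, mesh_date) = get_latest_sif_s3(get_mesh_ids=True)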
Example #11
def dump_sif(src_count_file, res_pos_file, belief_file, df_file=None,
             db_res_file=None, csv_file=None, reload=True, reconvert=True,
             ro=None, normalize_names: bool = True):
    """Build and dump a sif dataframe of PA statements with grounded agents

    Parameters
    ----------
    src_count_file : Union[str, S3Path]
        A location to load the source count dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    res_pos_file : Union[str, S3Path]
        A location to load the residue-position dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    belief_file : Union[str, S3Path]
        A location to load the belief dict from. Can be local file path,
        an s3 url string or an S3Path instance.
    df_file : Optional[Union[str, S3Path]]
        If provided, dump the sif to this location. Can be local file path,
        an s3 url string or an S3Path instance.
    db_res_file : Optional[Union[str, S3Path]]
        If provided, save the db content to this location. Can be local file
        path, an s3 url string or an S3Path instance.
    csv_file : Optional[Union[str, S3Path]]
        If provided, calculate dataframe statistics and save to local file
        or s3. Can be local file path, an s3 url string or an S3Path instance.
    reconvert : bool
        Whether to generate a new DataFrame from the database content or
        to load and return a DataFrame from `df_file`. If False, `df_file`
        must be given. Default: True.
    reload : bool
        If True, load new content from the database and make a new
        dataframe. If False, content can be loaded from provided files.
        Default: True.
    ro : Optional[PrincipalDatabaseManager]
        Provide a DatabaseManager to load database content from. If not
        provided, `get_db('primary')` will be used.
    normalize_names :
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: True.
    """
    def _load_file(path):
        if isinstance(path, str) and path.startswith('s3:') or \
                isinstance(path, S3Path):
            if isinstance(path, str):
                s3path = S3Path.from_string(path)
            else:
                s3path = path
            if s3path.to_string().endswith('pkl'):
                return load_pickle_from_s3(s3path)
            elif s3path.to_string().endswith('json'):
                return load_json_from_s3(s3path)
            else:
                raise ValueError(f'Unknown file format of {path}')
        else:
            if path.endswith('pkl'):
                with open(path, 'rb') as f:
                    return pickle.load(f)
            elif path.endswith('json'):
                with open(path, 'r') as f:
                    return json.load(f)
            else:
                raise ValueError(f'Unknown file format of {path}')

    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Load supporting files
    res_pos = _load_file(res_pos_file)
    src_count = _load_file(src_count_file)
    belief = _load_file(belief_file)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content, src_count_dict=src_count,
                        res_pos_dict=res_pos, belief_dict=belief,
                        normalize_names=normalize_names)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    csv_file.upload(s3, csv_buf)
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)
    return
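
A usage sketch for this newer dump_sif signature; the s3 urls for the three required support files and the outputs are placeholders that depend on the deployment.

dump_sif(src_count_file='s3://my-bucket/dumps/source_counts.pkl',
         res_pos_file='s3://my-bucket/dumps/res_pos.pkl',
         belief_file='s3://my-bucket/dumps/belief.pkl',
         df_file='s3://my-bucket/dumps/sif.pkl',
         csv_file='s3://my-bucket/dumps/type_counts.csv',
         normalize_names=True)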
Example #12
    def get_corr_stats_axb(
        self,
        z_corr: Optional[Union[str, pd.DataFrame]] = None,
        max_proc: Optional[int] = None,
        max_so_pairs_size: int = 10000,
        mp_pairs: bool = True,
        run_linear: bool = False,
    ) -> Results:
        """Get statistics of the correlations from different explanation types

        Note: the provided options have no effect if the data is loaded
        from cache.

        Parameters
        ----------
        z_corr : Optional[Union[pd.DataFrame, str]]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If a string is given, it
            is interpreted as a local file path to load the correlations
            from.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing
            in get_corr_stats_mp. Default: multiprocessing.cpu_count()
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000. If the
            number of pairs to check is smaller than 10 000, no sampling is
            done.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.

        Returns
        -------
        Results
            A BaseModel containing correlation data for different explanations
        """
        if not self.corr_stats_axb:
            s3 = get_s3_client(unsigned=False)
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                if S3Path.from_string(corr_stats_loc).exists(s3):
                    logger.info(f"Found corr stats data at {corr_stats_loc}")
                    corr_stats_json = file_opener(corr_stats_loc)
                    self.corr_stats_axb = Results(**corr_stats_json)
                else:
                    logger.info(f"No corr stats data at found at "
                                f"{corr_stats_loc}")
            except ValueError as ve:
                # Raised when s3 location is not set
                logger.warning(ve)

            # If not found on s3 or ValueError was raised
            if not self.corr_stats_axb:
                logger.info("Generating corr stats data")
                # Load correlation matrix
                if z_corr is None:
                    z_corr = self.load_z_corr()
                if isinstance(z_corr, str):
                    z_corr = self.load_z_corr(local_file_path=z_corr)
                # Load reactome if present
                try:
                    reactome = self.load_reactome()
                except FileNotFoundError:
                    logger.info("No reactome file used in script")
                    reactome = None
                self.corr_stats_axb: Results = axb_stats(
                    self.expl_df,
                    self.stats_df,
                    z_corr=z_corr,
                    reactome=reactome,
                    eval_str=False,
                    max_proc=max_proc,
                    max_corr_pairs=max_so_pairs_size,
                    do_mp_pairs=mp_pairs,
                    run_linear=run_linear,
                )
                try:
                    corr_stats_loc = self.get_s3_corr_stats_path()
                    logger.info(f"Uploading corr stats to S3 at "
                                f"{corr_stats_loc}")
                    s3p_loc = S3Path.from_string(corr_stats_loc)
                    s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                    logger.info("Finished uploading corr stats to S3")
                except ValueError:
                    logger.warning("Unable to upload corr stats to S3")
        else:
            logger.info("Data already present in corr_stats_axb")
        return self.corr_stats_axb
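
A hedged usage sketch; explainer stands for a hypothetical instance of the class this method belongs to, and the correlation file path is a placeholder.

# 'explainer' is a hypothetical instance of the enclosing class
results = explainer.get_corr_stats_axb(
    z_corr='z_corr.h5',   # placeholder path, passed to load_z_corr
    max_proc=4,
    max_so_pairs_size=10000,
    run_linear=False,
)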
Example #13
def get_logs_from_s3(folder=None, cached=True, past_days=None):
    """Download logs from S3 and save into a local folder

    Parameters
    ----------
    folder : str
        The directory where to put the processed reports
    cached : bool
        If True, skip tar.gz archives whose extracted log file already
        exists locally. Default: True.
    past_days : int|datetime.datetime
        Either an integer or an instance of a datetime.datetime object
        specifying the number of days into the past to download logs for.
        If nothing is specified (default), all logs are downloaded.
        Default: None.

    Returns
    -------
    dir_set : set
        A set containing the dir paths of all the requested logs
    """
    s3 = get_s3_client(unsigned=False)
    if past_days:
        days_ago = past_days if isinstance(past_days, datetime) else\
            ((datetime.utcnow() - timedelta(days=past_days)).replace(
                tzinfo=timezone.utc) if isinstance(past_days, int) else None)
    else:
        days_ago = None
    tree = get_s3_file_tree(s3, 'cwc-hms', 'bob_ec2_logs', days_ago)
    keys = tree.gets('key')
    # Here we only get the tar.gz files which contain the logs for the
    # facilitator + the json file (if present) of the user data
    logger.info('Total number of objects: %d ' % len(keys))
    logger.info('Total number of images found: %d' %
                len([k for k in keys if 'image' in k]))
    keys = [key for key in keys if key.startswith('bob_ec2_logs/')
            and key.endswith(('.tar.gz', '.json', '.log'))]
    logger.info('Number of archives: %d' % len(keys))

    fname_patt = re.compile(
        r'([\w:-]+?)_(\w+?)_(\w+?_\w+?)_(.*).(tar\.gz|json|\.log)'
    )
    dir_set = set()
    for key in tqdm.tqdm(keys):
        fname = os.path.basename(key)
        m = fname_patt.match(fname)
        if m is None:
            logger.warning("File name %s failed to match %s. Skipping..."
                           % (fname, fname_patt))
            continue
        image_id, cont_hash, cont_name, resource_name, suffix = m.groups()
        head_dir_path = '%s_%s_%s' % (image_id.replace(':', '-'), cont_name,
                                      cont_hash)
        dir_set.add(head_dir_path)
        if folder:
            head_dir_path = os.path.join(folder, head_dir_path)
        if not os.path.exists(head_dir_path):
            os.makedirs(head_dir_path, exist_ok=True)
        if resource_name == 'bioagent_images':
            outpath = head_dir_path
        else:
            outpath = os.path.join(head_dir_path, 'log.txt')
            if cached and os.path.exists(outpath) and\
                    not key.endswith(('.json', '.log')):
                continue
        tgz_file_name = key.split('/')[-1]
        tgz_file = os.path.join(head_dir_path, tgz_file_name)
        res = s3.get_object(Bucket='cwc-hms', Key=key)
        # byte_stream = BytesIO(res['Body'].read())
        byte_stream = res['Body'].read()
        with open(tgz_file, 'wb') as tf:
            tf.write(byte_stream)
        # Re-open file
        if tgz_file.endswith(('.json', '.log')):
            continue
        with open(tgz_file, 'rb') as file_byte_stream:
            with tarfile.open(None, 'r', fileobj=file_byte_stream) as tarf:
                if resource_name == 'bioagent_images':
                    tarf.extractall(outpath)
                else:
                    outpaths = tarf.getnames()
                    facls = [n for n in outpaths if
                             n.endswith('facilitator.log')]
                    if not facls:
                        logger.info('No facilitator.log found for %s' % key)
                        continue
                    facl = facls[0]
                    efo = tarf.extractfile(facl)
                    log_txt = efo.read().decode('utf-8')
                    with open(outpath, 'w') as fh:
                        fh.write(log_txt)
    return dir_set
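
A usage sketch; the output folder is a placeholder.

# Download the last week of logs into a placeholder folder, skipping cached ones
log_dirs = get_logs_from_s3(folder='bob_logs', cached=True, past_days=7)
print('%d log directories retrieved' % len(log_dirs))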
Example #14
def dump_pickle_to_s3(name, pyobj, prefix=''):
    """Dump a python object as a pickle to the NET_BUCKET s3 bucket"""
    s3 = get_s3_client(unsigned=False)
    key = prefix + name
    key = key.replace('//', '/')
    s3.put_object(Bucket=NET_BUCKET, Key=key, Body=pickle.dumps(obj=pyobj))
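
A brief usage sketch; NET_BUCKET is the bucket configured in the module, while the name and prefix below are placeholders.

# Store an object under a placeholder prefix in NET_BUCKET
dump_pickle_to_s3('network.pkl', {'nodes': [], 'edges': []},
                  prefix='indra_network_search/')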