def dump_sif(df_file=None, db_res_file=None, csv_file=None,
             src_count_file=None, reload=False, reconvert=True, ro=None):
    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    s3.put_object(Body=csv_buf.getvalue(), **csv_file.kw())
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)
    if src_count_file:
        _ = get_source_counts(src_count_file, ro=ro)
    return
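# Illustrative sketch (not part of the original module): one way the
# dump_sif signature directly above could be driven from Python rather than
# via the CLI in main() below. The file names are hypothetical placeholders.
def _example_dump_sif_local():
    """Dump the sif dataframe and per-type CSV stats to local files."""
    ro = get_ro('primary')
    dump_sif(df_file='sif.pkl',
             db_res_file='db_content.pkl',
             csv_file='sif_type_counts.csv',
             src_count_file='source_counts.pkl',
             reload=True,      # pull fresh content from the database
             reconvert=True,   # rebuild the dataframe from that content
             ro=ro)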
def main():
    args = get_parser().parse_args()
    ymd = args.s3_ymd
    if args.s3:
        logger.info('Uploading to %s/%s/%s on s3 instead of saving locally'
                    % (S3_SIF_BUCKET, S3_SUBDIR, ymd))

    db_res_file = _pseudo_key(args.db_dump, ymd) if args.s3 and args.db_dump \
        else args.db_dump
    df_file = _pseudo_key(args.dataframe, ymd) if args.s3 and args.dataframe \
        else args.dataframe
    csv_file = _pseudo_key(args.csv_file, ymd) if args.s3 and args.csv_file \
        else args.csv_file
    src_count_file = _pseudo_key(args.src_counts, ymd) if args.s3 and \
        args.src_counts else args.src_counts

    reload = args.reload
    if reload:
        logger.info('Reloading the database content from the database')
    else:
        logger.info('Loading cached database content from %s' % db_res_file)

    reconvert = args.reconvert
    if reconvert:
        logger.info('Reconverting database content into pandas dataframe')
    else:
        logger.info('Loading cached dataframe from %s' % df_file)

    for f in [db_res_file, df_file, csv_file, src_count_file]:
        if f:
            logger.info('Using file name %s' % f)

    dump_sif(df_file, db_res_file, csv_file, src_count_file, reload,
             reconvert,
             get_db('primary') if args.principal else get_ro('primary'))
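# Illustrative sketch (not part of the original module): a typical command
# line for this script. The script name and exact flag spellings are
# assumptions; the flags are defined in get_parser(), which is not shown in
# this section, and the names below are inferred from the attributes read
# off `args` in main(), so they may differ.
#
#   python dump_sif.py --db-dump db_content.pkl --dataframe sif.pkl \
#       --csv-file sif_type_counts.csv --src-counts source_counts.pkl \
#       --reload --reconvert --s3 --s3-ymd 2021-01-01
#
# When --s3 is given, main() wraps each provided file name with
# _pseudo_key(), so outputs go under S3_SIF_BUCKET/S3_SUBDIR/<ymd>/ on s3
# instead of being written locally.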
def dump_sif(src_count_file, res_pos_file, belief_file, df_file=None,
             db_res_file=None, csv_file=None, reload=True, reconvert=True,
             ro=None, normalize_names: bool = True):
    """Build and dump a sif dataframe of PA statements with grounded agents

    Parameters
    ----------
    src_count_file : Union[str, S3Path]
        A location to load the source count dict from. Can be a local file
        path, an s3 url string or an S3Path instance.
    res_pos_file : Union[str, S3Path]
        A location to load the residue-position dict from. Can be a local
        file path, an s3 url string or an S3Path instance.
    belief_file : Union[str, S3Path]
        A location to load the belief dict from. Can be a local file path,
        an s3 url string or an S3Path instance.
    df_file : Optional[Union[str, S3Path]]
        If provided, dump the sif to this location. Can be a local file
        path, an s3 url string or an S3Path instance.
    db_res_file : Optional[Union[str, S3Path]]
        If provided, save the db content to this location. Can be a local
        file path, an s3 url string or an S3Path instance.
    csv_file : Optional[Union[str, S3Path]]
        If provided, calculate dataframe statistics and save them to a local
        file or s3. Can be a local file path, an s3 url string or an S3Path
        instance.
    reconvert : bool
        Whether to generate a new DataFrame from the database content or to
        load and return a DataFrame from `df_file`. If False, `df_file` must
        be given. Default: True.
    reload : bool
        If True, load new content from the database and make a new
        dataframe. If False, content can be loaded from provided files.
        Default: True.
    ro : Optional[PrincipalDatabaseManager]
        Provide a DatabaseManager to load database content from. If not
        provided, `get_db('primary')` will be used.
    normalize_names : bool
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: True.
    """
    def _load_file(path):
        if isinstance(path, S3Path) or \
                (isinstance(path, str) and path.startswith('s3:')):
            s3path = S3Path.from_string(path) if isinstance(path, str) \
                else path
            if s3path.to_string().endswith('pkl'):
                return load_pickle_from_s3(s3path)
            elif s3path.to_string().endswith('json'):
                return load_json_from_s3(s3path)
            else:
                raise ValueError(f'Unknown file format of {path}')
        else:
            if path.endswith('pkl'):
                with open(path, 'rb') as f:
                    return pickle.load(f)
            elif path.endswith('json'):
                with open(path, 'r') as f:
                    return json.load(f)
            else:
                raise ValueError(f'Unknown file format of {path}')

    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Load supporting files
    res_pos = _load_file(res_pos_file)
    src_count = _load_file(src_count_file)
    belief = _load_file(belief_file)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content, src_count_dict=src_count,
                        res_pos_dict=res_pos, belief_dict=belief,
                        normalize_names=normalize_names)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    csv_file.upload(s3, csv_buf)
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)
    return
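# Illustrative sketch (not part of the original module): calling the
# extended dump_sif with supporting dicts stored on s3. The bucket and key
# names are hypothetical placeholders; any mix of local paths and s3 urls
# would work the same way via _load_file().
def _example_dump_sif_s3():
    """Build the sif dataframe from s3-hosted supporting files."""
    dump_sif(src_count_file='s3://my-bucket/sif/source_counts.pkl',
             res_pos_file='s3://my-bucket/sif/res_pos.pkl',
             belief_file='s3://my-bucket/sif/belief.json',
             df_file='s3://my-bucket/sif/sif.pkl',
             csv_file='s3://my-bucket/sif/sif_type_counts.csv',
             reload=True,           # query the database afresh
             reconvert=True,        # rebuild the dataframe from the content
             normalize_names=True)  # merge e.g. Loratadin vs loratadin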
def get_pa_stmt_jsons(clauses=None, with_evidence=True, db=None, limit=1000):
    """Load preassembled Statements from the principal database."""
    if db is None:
        db = get_db('primary')

    if clauses is None:
        clauses = []

    # Construct the core query.
    if with_evidence:
        text_ref_cols = [db.Reading.id, db.TextContent.id, db.TextRef.pmid,
                         db.TextRef.pmcid, db.TextRef.doi, db.TextRef.url,
                         db.TextRef.pii]
        text_ref_types = tuple([str if isinstance(col.type, String) else int
                                for col in text_ref_cols])
        text_ref_cols = tuple([cast(col, String)
                               if not isinstance(col.type, String) else col
                               for col in text_ref_cols])
        text_ref_labels = ('rid', 'tcid', 'pmid', 'pmcid', 'doi', 'url',
                           'pii')
        core_q = db.session.query(
            db.PAStatements.mk_hash.label('mk_hash'),
            db.PAStatements.json.label('json'),
            func.array_agg(db.RawStatements.json).label("raw_jsons"),
            func.array_agg(array(text_ref_cols)).label("text_refs")
        ).outerjoin(
            db.RawUniqueLinks,
            db.RawUniqueLinks.pa_stmt_mk_hash == db.PAStatements.mk_hash
        ).join(
            db.RawStatements,
            db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
        ).outerjoin(
            db.Reading,
            db.Reading.id == db.RawStatements.reading_id
        ).outerjoin(
            db.TextContent,
            db.TextContent.id == db.Reading.text_content_id
        ).outerjoin(
            db.TextRef,
            db.TextRef.id == db.TextContent.text_ref_id
        )
    else:
        text_ref_types = None
        text_ref_labels = None
        core_q = db.session.query(
            db.PAStatements.mk_hash.label('mk_hash'),
            db.PAStatements.json.label('json'),
            null().label('raw_jsons'),
            null().label('text_refs')
        )
    core_q = core_q.filter(*clauses).group_by(db.PAStatements.mk_hash,
                                              db.PAStatements.json)
    if limit:
        core_q = core_q.limit(limit)
    core_sq = core_q.subquery().alias('core')

    # Construct the layer of the query that gathers agent info.
    agent_tuple = (cast(db.PAAgents.ag_num, String),
                   db.PAAgents.db_name,
                   db.PAAgents.db_id)
    at_sq = db.session.query(
        core_sq.c.mk_hash,
        core_sq.c.json,
        core_sq.c.raw_jsons,
        core_sq.c.text_refs,
        func.array_agg(array(agent_tuple)).label('db_refs')
    ).filter(
        db.PAAgents.stmt_mk_hash == core_sq.c.mk_hash
    ).group_by(
        core_sq.c.mk_hash,
        core_sq.c.json,
        core_sq.c.raw_jsons,
        core_sq.c.text_refs
    ).subquery().alias('agent_tuples')

    # Construct the layer of the query that gathers supports/supported by.
    sup_from = aliased(db.PASupportLinks, name='sup_from')
    sup_to = aliased(db.PASupportLinks, name='sup_to')
    q = db.session.query(
        at_sq.c.mk_hash,
        at_sq.c.json,
        at_sq.c.raw_jsons,
        at_sq.c.text_refs,
        at_sq.c.db_refs,
        func.array_agg(sup_from.supporting_mk_hash).label('supporting_hashes'),
        func.array_agg(sup_to.supported_mk_hash).label('supported_hashes')
    ).outerjoin(
        sup_from,
        sup_from.supported_mk_hash == at_sq.c.mk_hash
    ).outerjoin(
        sup_to,
        sup_to.supporting_mk_hash == at_sq.c.mk_hash
    ).group_by(
        at_sq.c.mk_hash,
        at_sq.c.json,
        at_sq.c.raw_jsons,
        at_sq.c.text_refs,
        at_sq.c.db_refs
    )

    # Run and parse the query.
    stmt_jsons = {}
    stmts_by_hash = {}
    for h, sj, rjs, text_refs, db_refs, supping, supped in q.all():
        # Gather the agent refs.
        db_ref_dicts = defaultdict(lambda: defaultdict(list))
        for ag_num, db_name, db_id in db_refs:
            db_ref_dicts[int(ag_num)][db_name].append(db_id)
        db_ref_dicts = {k: dict(v) for k, v in db_ref_dicts.items()}

        # Clean supping and supped.
        supping = [h for h in set(supping) if h is not None]
        supped = [h for h in set(supped) if h is not None]

        # Parse the JSON bytes into JSON.
        stmt_json = json.loads(sj)
        if 'supports' not in stmt_json:
            stmt_json['supports'] = []
        if 'supported_by' not in stmt_json:
            stmt_json['supported_by'] = []

        # Load the evidence.
        if rjs is not None:
            for rj, text_ref_values in zip(rjs, text_refs):
                raw_json = json.loads(rj)
                ev = raw_json['evidence'][0]
                if any(v is not None for v in text_ref_values):
                    tr_dict = {lbl.upper(): None if val == "None" else typ(val)
                               for lbl, typ, val in zip(text_ref_labels,
                                                        text_ref_types,
                                                        text_ref_values)}
                    _fix_evidence(ev, tr_dict.pop('RID'), tr_dict.pop('TCID'),
                                  tr_dict)
                if 'evidence' not in stmt_json:
                    stmt_json['evidence'] = []
                stmt_json['evidence'].append(ev)

        # Resolve supports/supported-by, as much as possible.
        stmts_by_hash[h] = stmt_json
        for supped_h in (h for h in supped if h in stmts_by_hash):
            stmt_json['supports'].append(stmts_by_hash[supped_h]['id'])
            stmts_by_hash[supped_h]['supported_by'].append(stmt_json['id'])
        for supping_h in (h for h in supping if h in stmts_by_hash):
            stmt_json['supported_by'].append(stmts_by_hash[supping_h]['id'])
            stmts_by_hash[supping_h]['supports'].append(stmt_json['id'])

        # Put it together in a dictionary.
        result_dict = {
            "mk_hash": h,
            "stmt": stmt_json,
            "db_refs": db_ref_dicts,
            "supports_hashes": supping,
            "supported_by_hashes": supped
        }
        stmt_jsons[h] = result_dict
    return stmt_jsons
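# Illustrative sketch (not part of the original module): restricting the
# dump to a handful of statement hashes. Any SQLAlchemy filter clause
# against db.PAStatements can be passed in `clauses`; the hash values used
# here are made up.
def _example_get_pa_stmt_jsons():
    """Fetch a few preassembled statements along with their evidence."""
    db = get_db('primary')
    clauses = [db.PAStatements.mk_hash.in_([-12345, 67890])]
    stmt_jsons = get_pa_stmt_jsons(clauses=clauses, with_evidence=True,
                                   db=db, limit=10)
    for mk_hash, entry in stmt_jsons.items():
        # Each entry bundles the statement JSON, agent db_refs by agent
        # number, and the supporting/supported hash lists.
        print(mk_hash, entry['stmt'].get('type'),
              len(entry['stmt'].get('evidence', [])))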