def _get_feature_space_condition(feature_space): config = cfg.get_default_config() gene_table = config.tables.geneinfo CONDITION = ("rid in (SELECT CAST(gene_id AS STRING) " "FROM `{}` " "WHERE feature_space in UNNEST({}))").format( gene_table, _get_feature_list(feature_space)) return CONDITION
def cmap_genetic_perts(client, pert_id=None, cmap_name=None, gene_id=None, gene_title=None, ensemble_id=None, table=None, verbose=False): """ Query genetic_pertinfo table :param client: Bigquery Client :param pert_id: List of pert_ids :param cmap_name: List of cmap_names :param gene_id: List of type INTEGER corresponding to gene_ids :param gene_title: List of gene_titles :param ensemble_id: List of ensumble_ids :param table: table to query. This by default points to the siginfo table and normally should not be changed. :param verbose: Print query and table address. :return: """ if table is None: config = cfg.get_default_config() table = config.tables.genetic_pertinfo SELECT = "SELECT *" FROM = "FROM {}".format(table) CONDITIONS = [] if pert_id: pert_id = parse_condition(pert_id) CONDITIONS.append("pert_id in UNNEST({})".format(list(pert_id))) if cmap_name: cmap_name = parse_condition(cmap_name) CONDITIONS.append("cmap_name in UNNEST({})".format(list(cmap_name))) if gene_id: gene_id = parse_condition(gene_id) CONDITIONS.append("gene_id in UNNEST({})".format(list(gene_id))) if gene_title: gene_title = parse_condition(gene_title) CONDITIONS.append("gene_title in UNNEST({})".format(list(gene_title))) if ensemble_id: ensemble_id = parse_condition(ensemble_id) CONDITIONS.append("ensemble_id in UNNEST({})".format( list(ensemble_id))) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" query = " ".join([SELECT, FROM, WHERE]) if verbose: print("Table: \n {}".format(table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()
def list_tables(): """ Print table addresses. Comes from defaults in config. :return: None """ config = cfg.get_default_config() print(config.tables) return
def list_cmap_compounds(client): """ List available compounds :param client: BigQuery Client :return: Single column Dataframe of compounds """ config = cfg.get_default_config() compoundinfo_table = config.tables.compoundinfo QUERY = "SELECT DISTINCT cmap_name from {}".format(compoundinfo_table) return run_query(client, QUERY).result().to_dataframe()
def list_cmap_moas(client): """ List available MoAs :param client: BigQuery Client :return: Single column Dataframe of MoAs """ config = cfg.get_default_config() compoundinfo_table = config.tables.compoundinfo QUERY = ( 'SELECT moa, ' 'COUNT(DISTINCT(pert_id)) AS count ' 'FROM `{}` ' 'GROUP BY moa') QUERY = QUERY.format(compoundinfo_table) return run_query(client, QUERY).result().to_dataframe()
def list_cmap_targets(client): """ List available targets :param client: BigQuery Client :return: Pandas DataFrame """ config = cfg.get_default_config() compoundinfo_table = config.tables.compoundinfo QUERY = ( 'SELECT target, ' 'COUNT(DISTINCT(pert_id)) AS count ' 'FROM `{}` ' 'GROUP BY target') QUERY = QUERY.format(compoundinfo_table) return run_query(client, QUERY).result().to_dataframe()
def _get_numerical_table_id(table=None, data_level="level5", feature_space="landmark", rid=False): config = cfg.get_default_config() if table is not None: table_id = table elif feature_space == "landmark": if data_level == "level3": table_id = config.tables.level3_landmark elif data_level == "level4": table_id = config.tables.level4_landmark elif data_level == "level5": table_id = config.tables.level5_landmark else: print( "Unsupported data_level. select from ['level3', 'level4', level5'].\n Default is 'level5'. " ) raise ValueError else: if data_level == "level3": if rid: table_id = config.tables.level3_rid else: table_id = config.tables.level3 elif data_level == "level4": if rid: table_id = config.tables.level4_rid else: table_id = config.tables.level4 elif data_level == "level5": if rid: table_id = config.tables.level5_rid else: table_id = config.tables.level5 else: print( "Unsupported data_level. select from ['level3', 'level4', level5'].\n Default is 'level5'. " ) raise ValueError return table_id
def cmap_matrix( client, data_level="level5", feature_space="landmark", rid=None, cid=None, verbose=False, chunk_size=1000, table=None, limit=4000, ): """ Query for numerical data for signature-gene level data. :param client: Bigquery Client :param data_level: Data level requested. IDs from siginfo file correspond to 'level5'. Ids from instinfo are available in 'level3' and 'level4'. Choices are ['level5', 'level4', 'level3'] :param rid: Row ids :param cid: Column ids :param feature_space: Common featurespaces to extract. 'rid' overrides selection Choices: ['landmark', 'bing', 'aig'] landmark: 978 landmark genes bing: Best-inferred set of 10,174 genes aig: All inferred genes including 12,328 genes Default is landmark. :param chunk_size: Runs queries in stages to avoid query character limit. Default 1,000 :param limit: Soft limit for number of signatures allowed. Default is 4,000. :param table: Table address to query. Overrides 'data_level' parameter. Generally should not be used. :param verbose: Print query and table address. :return: GCToo object """ config = cfg.get_default_config() if table is not None: table_id = table else: if data_level == "level3": table_id = config.tables.level3 elif data_level == "level4": table_id = config.tables.level4 elif data_level == "level5": table_id = config.tables.level5 else: print( "Unsupported data_level. select from ['level3', 'level4', level5'].\n Default is 'level5'. " ) sys.exit(1) if cid: cid = parse_condition(cid) assert len(cid) <= limit, "List of cids can not exceed limit of {}".format( limit ) cur = 0 nparts = ceil(len(cid) / chunk_size) result_dfs = [] while cur < nparts: start = cur * chunk_size end = ( cur * chunk_size + chunk_size ) # No need to check for end, index only returns present values cur = cur + 1 print("Running query ... ({}/{})".format(cur, nparts)) result_dfs.append( _build_and_launch_query( client, table_id, rid=rid, cid=cid[start:end], feature_space=feature_space, verbose=verbose ) ) try: pool = mp.Pool(mp.cpu_count()) print("Pivoting Dataframes to GCT objects") result_gctoos = pool.map(_pivot_result, result_dfs) pool.close() except: if nparts > 1: print("Multiprocessing unavailable, pivoting chunks in series...") cur = 0 result_gctoos = [] for df in result_dfs: cur = cur + 1 print("Pivoting... ({}/{})".format(cur, nparts)) result_gctoos.append(_pivot_result(df)) print("Complete") return hstack(result_gctoos) else: print("Provide column ids to extract using the cid= keyword argument") sys.exit(1)
def cmap_compounds( client, pert_id=None, cmap_name=None, moa=None, target=None, compound_aliases=None, limit=None, verbose=False, ): """ Query compoundinfo table for various field by providing lists of compounds, moa, targets, etc. 'AND' operator used for multiple conditions. :param client: BigQuery Client :param pert_id: List of pert_ids :param cmap_name: List of cmap_names :param target: List of targets :param moa: List of MoAs :param compound_aliases: List of compound aliases :param limit: Maximum number of rows to return :param verbose: Print query and table address. :return: Pandas Dataframe matching queries """ config = cfg.get_default_config() compoundinfo_table = config.tables.compoundinfo SELECT = "SELECT *" FROM = "FROM {}".format(compoundinfo_table) CONDITIONS = [] if pert_id: pert_id = parse_condition(pert_id) CONDITIONS.append("pert_id in UNNEST({})".format(list(pert_id))) if cmap_name: cmap_name = parse_condition(cmap_name) CONDITIONS.append("cmap_name in UNNEST({})".format(list(cmap_name))) if target: target = parse_condition(target) CONDITIONS.append("target in UNNEST({})".format(list(target))) if moa: moa = parse_condition(moa) CONDITIONS.append("moa in UNNEST({})".format(list(moa))) if compound_aliases: compound_aliases = parse_condition(compound_aliases) CONDITIONS.append( "compound_aliases in UNNEST({})".format(list(compound_aliases)) ) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" if limit: assert isinstance(limit, int), "Limit argument must be an integer" WHERE = WHERE + " LIMIT {}".format(limit) query = " ".join([SELECT, FROM, WHERE]) if verbose: print("Table: \n {}".format(compoundinfo_table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()
def cmap_profiles( client, sample_id=None, pert_id=None, pert_itime=None, pert_idose=None, pert_type=None, cmap_name=None, cell_iname=None, det_plate=None, build_name=None, project_code=None, return_fields='priority', limit=None, table=None, verbose=False, ): """ Query per sample metadata, corresponds to level 3 and level 4 data, AND operator used for multiple conditions. :param client: Bigquery client :param sample_id: list of sample_ids :param pert_id: list of pert_ids :param pert_itime: list of timepoints :param pert_idose: list of doses :param pert_type: list of pert_types. Avoid using only this parameter as the return could be very large. :param cmap_name: list of cmap_names :param det_plate: list of det_plates :param build_name: list of builds :param project_code: list of project_codes :param return_fields: ['priority', 'all'] :param limit: Maximum number of rows to return :param table: table to query. This by default points to the siginfo table and normally should not be changed. :param verbose: Print query and table address. :return: Pandas Dataframe """ if table is None: config = cfg.get_default_config() table = config.tables.instinfo priority_fields = ['sample_id', 'det_plate', 'pert_id', 'cmap_name', 'pert_type', 'cell_iname', 'pert_itime', 'pert_idose', 'det_plate', 'build_name', 'project_code'] if return_fields == 'priority': SELECT = "SELECT " + ",".join(priority_fields) elif return_fields == 'all': SELECT = "SELECT *" else: print("return_fields only takes ['priority', 'all']") sys.exit(1) FROM = "FROM {}".format(table) CONDITIONS = [] if pert_id: pert_id = parse_condition(pert_id) CONDITIONS.append("pert_id in UNNEST({})".format(list(pert_id))) if pert_itime: pert_itime = parse_condition(pert_itime) CONDITIONS.append("pert_itime in UNNEST({})".format(list(pert_itime))) if pert_idose: pert_idose = parse_condition(pert_idose) CONDITIONS.append("pert_idose in UNNEST({})".format(list(pert_idose))) if pert_type: pert_type = parse_condition(pert_type) CONDITIONS.append("pert_type in UNNEST({})".format(list(pert_type))) if sample_id: sample_id = parse_condition(sample_id) CONDITIONS.append("sample_id in UNNEST({})".format(list(sample_id))) if cell_iname: cell_iname = parse_condition(cell_iname) CONDITIONS.append("cell_iname in UNNEST({})".format(list(cell_iname))) if cmap_name: cmap_name = parse_condition(cmap_name) CONDITIONS.append("cmap_name in UNNEST({})".format(list(cmap_name))) if det_plate: det_plate = parse_condition(det_plate) CONDITIONS.append("det_plate in UNNEST({})".format(list(det_plate))) if build_name: build_name = parse_condition(build_name) CONDITIONS.append("build_name in UNNEST({})".format(list(build_name))) if project_code: project_code = parse_condition(project_code) CONDITIONS.append("project_code in UNNEST({})".format(list(project_code))) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" if limit: assert isinstance(limit, int), "Limit argument must be an integer" WHERE = WHERE + " LIMIT {}".format(limit) query = " ".join([SELECT, FROM, WHERE]) assert ( len(query) < 1024 * 10 ** 3 ), "Query length exceeds maximum allowed by BQ, keep under 1M characters" if verbose: print("Table: \n {}".format(table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()
def cmap_sig( client, sig_id=None, pert_id=None, pert_itime=None, pert_idose=None, pert_type=None, cmap_name=None, cell_iname=None, det_plates=None, build_name=None, project_code=None, return_fields='priority', limit=None, table=None, verbose=False, ): """ Query level 5 metadata table. Multiple parameters are filtered using the 'AND' operator :param client: Bigquery Client :param sig_id: list of sig_ids :param pert_id: list of pert_ids :param pert_itime: list of timepoints :param pert_idose: list of doses :param pert_type: list of pert_types. Avoid using only this parameter as the return could be very large. :param cmap_name: list of cmap_name, formerly pert_iname :param cell_iname: list of cell names :param det_plates: list of det_plates. det_plates values are the concatenation of values from instinfo det_plate field with the '|' delimiter used. :param build_name: list of builds :param project_code: list of project_codes :param return_fields: ['priority', 'all'] :param limit: Maximum number of rows to return :param table: table to query. This by default points to the level 5 siginfo table and normally should not be changed. :param verbose: Print query and table address. :return: Pandas Dataframe """ priority_fields = ['sig_id', 'pert_id', 'cmap_name', 'pert_type', 'cell_iname', 'pert_itime', 'pert_idose', 'nsample', 'det_plates', 'build_name', 'project_code', 'ss_ngene', 'cc_q75', 'tas'] if return_fields == 'priority': SELECT = "SELECT " + ",".join(priority_fields) elif return_fields == 'all': SELECT = "SELECT *" else: print("return_fields only takes ['priority', 'all']") sys.exit(1) if table is None: config = cfg.get_default_config() table = config.tables.siginfo FROM = "FROM {}".format(table) CONDITIONS = [] if pert_id: pert_id = parse_condition(pert_id) CONDITIONS.append("pert_id in UNNEST({})".format(list(pert_id))) if pert_itime: pert_itime = parse_condition(pert_itime) CONDITIONS.append("pert_itime in UNNEST({})".format(list(pert_itime))) if pert_idose: pert_idose = parse_condition(pert_idose) CONDITIONS.append("pert_idose in UNNEST({})".format(list(pert_idose))) if pert_type: pert_type = parse_condition(pert_type) CONDITIONS.append("pert_type in UNNEST({})".format(list(pert_type))) if sig_id: sig_id = parse_condition(sig_id) CONDITIONS.append("sig_id in UNNEST({})".format(list(sig_id))) if cell_iname: cell_iname = parse_condition(cell_iname) CONDITIONS.append("cell_iname in UNNEST({})".format(list(cell_iname))) if cmap_name: cmap_name = parse_condition(cmap_name) CONDITIONS.append("cmap_name in UNNEST({})".format(list(cmap_name))) if det_plates: det_plates = parse_condition(det_plates) CONDITIONS.append("det_plates in UNNEST({})".format(list(det_plates))) if build_name: build_name = parse_condition(build_name) CONDITIONS.append("build_name in UNNEST({})".format(list(build_name))) if project_code: project_code = parse_condition(project_code) CONDITIONS.append("project_code in UNNEST({})".format(list(project_code))) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" if limit: assert isinstance(limit, int), "Limit argument must be an integer" WHERE = WHERE + " LIMIT {}".format(limit) query = " ".join([SELECT, FROM, WHERE]) if verbose: print("Table: \n {}".format(table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()
def cmap_genes(client, gene_id=None, gene_symbol=None, ensembl_id=None, gene_title = None, gene_type=None, feature_space="aig", src=None, table=None, verbose=False): """ Query geneinfo table. Geneinfo contains information about genes including ids, symbols, types, ensembl_ids, etc. :param client: Bigquery Client :param gene_id: list of gene_ids :param gene_symbol: list of gene_symbols :param ensembl_id: list of ensembl_ids :param gene_title: list of gene_titles :param gene_type: list of gene_types :param feature_space: Common featurespaces to extract. 'rid' overrides selection Choices: ['landmark', 'bing', 'aig'] landmark: 978 landmark genes bing: Best-inferred set of 10,174 genes aig: All inferred genes including 12,328 genes Default is aig. :param src: list of gene sources :param table: table to query. This by default points to the siginfo table and normally should not be changed. :param verbose: Print query and table address. :return: Pandas DataFrame """ if table is None: config = cfg.get_default_config() table = config.tables.geneinfo SELECT = "SELECT *" FROM = "FROM {}".format(table) CONDITIONS = [] if gene_id: gene_id = parse_condition(gene_id) CONDITIONS.append("gene_id in UNNEST({})".format(list(gene_id))) if gene_symbol: gene_symbol = parse_condition(gene_symbol) CONDITIONS.append("gene_symbol in UNNEST({})".format(list(gene_symbol))) if ensembl_id: ensembl_id = parse_condition(ensembl_id) CONDITIONS.append("ensembl_id in UNNEST({})".format(list(ensembl_id))) if gene_title: gene_title = parse_condition(gene_title) CONDITIONS.append("gene_title in UNNEST({})".format(list(gene_title))) if gene_type: gene_type = parse_condition(gene_type) CONDITIONS.append("gene_type in UNNEST({})".format(list(gene_type))) if feature_space: CONDITIONS.append("feature_space in UNNEST({})".format(list(_get_feature_list(feature_space)))) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" query = " ".join([SELECT, FROM, WHERE]) if verbose: print("Table: \n {}".format(table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()
def cmap_cell(client, cell_iname=None, cell_alias = None, ccle_name=None, primary_disease=None, cell_lineage=None, cell_type=None, table=None, verbose=False): """ Query cellinfo table :param client: Bigquery Client :param cell_iname: List of cell_inames :param cell_alias: List of cell aliases :param ccle_name: List of ccle_names :param primary_disease: List of primary_diseases :param cell_lineage: List of cell_lineages :param cell_type: List of cell_types :param table: table to query. This by default points to the siginfo table and normally should not be changed. :param verbose: Print query and table address. :return: Pandas DataFrame """ if table is None: config = cfg.get_default_config() table = config.tables.cellinfo SELECT = "SELECT *" FROM = "FROM {}".format(table) CONDITIONS = [] if cell_iname: cell_iname = parse_condition(cell_iname) CONDITIONS.append("cell_iname in UNNEST({})".format(list(cell_iname))) if cell_alias: cell_alias = parse_condition(cell_alias) CONDITIONS.append("cell_alias in UNNEST({})".format(list(cell_alias))) if ccle_name: ccle_name = parse_condition(ccle_name) CONDITIONS.append("ccle_name in UNNEST({})".format(list(ccle_name))) if primary_disease: primary_disease = parse_condition(primary_disease) CONDITIONS.append("primary_disease in UNNEST({})".format(list(primary_disease))) if cell_lineage: cell_lineage = parse_condition(cell_lineage) CONDITIONS.append("cell_lineage in UNNEST({})".format(list(cell_lineage))) if cell_type: cell_type = parse_condition(cell_type) CONDITIONS.append("cell_type in UNNEST({})".format(list(cell_type))) if CONDITIONS: WHERE = "WHERE " + " AND ".join(CONDITIONS) else: WHERE = "" query = " ".join([SELECT, FROM, WHERE]) if verbose: print("Table: \n {}".format(table)) print("Query:\n {}".format(query)) return run_query(client, query).result().to_dataframe()