def __init__(self):
    """Create new Zenodo Graph Repository object.

    The repository metadata is obtained by merging the content of the
    bundled `zenodo.json` document with the mapping returned by
    `load_wikidata_metatada`. When both provide the same key, the value
    coming from `load_wikidata_metatada` takes precedence.
    """
    super().__init__()
    bundled_metadata = compress_json.local_load("zenodo.json")
    wikidata_metadata = self.load_wikidata_metatada()
    self._data = {**bundled_metadata, **wikidata_metadata}
def get_available_graphs_from_repository(repository: str) -> List[str]:
    """Return list of available graphs from the given repository.

    Parameters
    ----------------------
    repository: str,
        The name of the repository to retrieve the graph from.

    Raises
    ----------------------
    ValueError,
        If the given repository is not available.

    Returns
    ----------------------
    List with the names of the graphs stored for the given repository.
    """
    repositories = get_available_repositories()
    if not set_validator(repositories)(repository):
        raise ValueError((
            "The provided repository `{}` is not within the set "
            "of supported repositories, {}.\n"
            "Did you mean `{}`?"
        ).format(
            repository,
            ", ".join(repositories),
            closest(repository, repositories)
        ))
    repository_data = compress_json.local_load(
        "{repository}.json.gz".format(repository=repository)
    )
    # Materialise the keys view: the signature promises a list, which
    # callers may index, slice or serialise.
    return list(repository_data.keys())
def __init__(self):
    """Create new Monarch Initiative Graph Repository object.

    The metadata stored in `monarch_initiative.json` is extended with one
    entry per `monarch-kg.tar.gz` archive listed in the public
    `monarch-ingest` Google bucket. The first path component of each
    archive key is used as the version name.
    """
    super().__init__()

    # Entries that cannot be discovered by scraping the bucket.
    self._data = compress_json.local_load("monarch_initiative.json")

    bucket_url = "https://storage.googleapis.com/monarch-ingest/"

    # Loading arguments shared by every scraped version.
    shared_arguments = {
        "sources_column": "subject",
        "destinations_column": "object",
        "edge_list_edge_types_column": "predicate",
        "nodes_column": "id",
        "node_list_node_types_column": "category",
        "node_types_separator": "|",
        "name": "Monarch"
    }

    # The bucket index is an XML document: we only keep the keys that
    # point to a knowledge graph archive.
    bucket_index = pd.read_xml(bucket_url).fillna("NaN")
    is_graph_archive = bucket_index.Key.str.endswith("/monarch-kg.tar.gz")
    archive_keys = bucket_index[is_graph_archive].Key

    for archive_key in archive_keys:
        version = archive_key.split("/")[0]
        arguments = {
            "edge_path": "monarch-kg/monarch-kg_edges.tsv",
            "node_path": "monarch-kg/monarch-kg_nodes.tsv",
        }
        arguments.update(shared_arguments)
        self._data["Monarch"][version] = {
            "urls": [bucket_url + archive_key],
            "arguments": arguments
        }
def train(epigenomes, labels, models, kwargs, region, cell_line):
    """Fit every model on stratified holdouts and collect its scores.

    Results are checkpointed to `<cell_line>/results_<region>.json`
    after each trained model; combinations of model and holdout that
    `precomputed` reports as already present are skipped.

    Parameters
    ----------
    epigenomes:
        Mapping from region to an object exposing `.values` with the
        features.
    labels:
        Mapping from region to the labels, indexable by position arrays.
    models:
        Models exposing `fit` and `predict`.
    kwargs:
        One dictionary of `fit` keyword arguments per model.
    region:
        Key of the region to train on.
    cell_line:
        Cell line name, used as directory of the results file.

    Returns
    -------
    DataFrame with one row per (model, run type, holdout), without the
    `holdout` column.
    """
    features = epigenomes[region].values
    targets = labels[region]
    n_splits = 10
    splitter = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=0.2, random_state=42)
    results_path = cell_line + "/results_" + region + ".json"

    results = []
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)

    holdout_iterator = tqdm(
        enumerate(splitter.split(features, targets)),
        total=n_splits,
        desc="Computing holdouts",
        dynamic_ncols=True
    )
    for holdout, (train_idx, test_idx) in holdout_iterator:
        model_iterator = tqdm(
            zip(models, kwargs),
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True
        )
        for model, fit_params in model_iterator:
            class_name = model.__class__.__name__
            # Keras Sequential models are told apart by their own name.
            model_name = model.name if class_name == "Sequential" else class_name
            if precomputed(results, model_name, holdout):
                continue
            model.fit(features[train_idx], targets[train_idx], **fit_params)
            for run_type, indices in (("train", train_idx), ("test", test_idx)):
                results.append({
                    "model": model_name,
                    "run_type": run_type,
                    "holdout": holdout,
                    **report(targets[indices], model.predict(features[indices]))
                })
            # Checkpoint so that an interrupted run can be resumed.
            compress_json.local_dump(results, results_path)

    return pd.DataFrame(results).drop(columns=["holdout"])
def last_db_time_get():
    """Return the generation time of the most recently created DB dump.

    The dumps are looked up as `DB_*.json.gz` inside the `jsons`
    directory of `db_folder_path`; the one with the greatest creation
    time (`os.path.getctime`) is loaded.

    Returns
    -------
    datetime.datetime
        The parsed `generation_timestamp` of the dump (naive datetime).

    Raises
    ------
    ValueError
        If no dump matching the pattern exists.
    """
    # os.path.join keeps the Windows layout while also working on
    # systems that do not use the backslash as separator.
    pattern = os.path.join(db_folder_path, "jsons", "DB_*.json.gz")
    candidates = glob.glob(pattern)
    if not candidates:
        raise ValueError(
            "No database dump matching `{}` was found.".format(pattern)
        )
    latest_file = max(candidates, key=os.path.getctime)

    # Load the dump into memory.
    db_file = compress_json.local_load(latest_file)
    # The dump may have been stored as a JSON string instead of a dict:
    # in that case we decode it once more.
    if not isinstance(db_file, dict):
        db_file = json.loads(db_file)

    # Convert the creation timestamp of the dump.
    db_time = db_file["generation_timestamp"]
    return datetime.datetime.strptime(db_time, "%Y-%m-%dT%H:%M:%S.%fZ")
def load_graph_data(self, graph_name: str) -> Dict:
    """Return the data stored for the provided graph.

    Parameters
    -----------------------
    graph_name: str,
        Name of graph to retrieve data for.

    Returns
    -----------------------
    The content of the file located at the path returned by
    `get_graph_data_path` for this graph.
    """
    graph_data_path = self.get_graph_data_path(graph_name)
    return compress_json.local_load(graph_data_path)
def is_normalized_metric(metric: str) -> bool:
    """Return whether the given metric is known to be between 0 and 1.

    The metric name is first sanitized, then it is considered normalized
    when any entry of `normalized_metrics.json` appears within it.

    Parameters
    ----------
    metric: str
        The metric to check for.

    Returns
    -------
    Boolean representing if given metric is known to be between 0 and 1.
    """
    sanitized_metric = sanitize_ml_labels(metric)
    for candidate in compress_json.local_load("normalized_metrics.json"):
        if candidate in sanitized_metric:
            return True
    return False
def fantom_available_cell_lines(
    root: str = "fantom",
) -> pd.DataFrame:
    """Return supported cell lines available within FANTOM dataset.

    Parameters
    ---------------------------------------
    root: str = "fantom",
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the supported cell lines mapped to FANTOM name,
    with columns `cell_line` and `code` and one row per cell line.
    """
    info = compress_json.local_load("fantom.json")
    path = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], path, cache=True)
    df = pd.read_csv(
        path,
        sep="\t",
        header=None
    )
    # Column 0 holds the text before "cell line:", column 1 what follows.
    cell_lines_names = df[0].str.split("cell line:", expand=True)
    descriptions = cell_lines_names[0]

    # H1 and H9 samples are recognised from their description.
    # We assign through `.loc` because chained assignment
    # (`frame[1][mask] = ...`) does not update the frame when pandas
    # Copy-on-Write is enabled.
    h1_mask = (
        descriptions.str.startswith("H1") &
        descriptions.str.contains("day00")
    )
    h9_mask = (
        descriptions.str.startswith("H9") &
        descriptions.str.contains("H9ES")
    )
    cell_lines_names.loc[h1_mask, 1] = "H1"
    cell_lines_names.loc[h9_mask, 1] = "H9"

    # Drop the rows without a cell line name.
    nan_mask = pd.notnull(cell_lines_names[1])
    cell_lines_names = cell_lines_names[nan_mask].copy()
    # Drop the samples whose name mentions an infection.
    infected_mask = ~cell_lines_names[1].str.contains("infection")
    cell_lines_names = cell_lines_names[infected_mask].copy()

    # Keep only what precedes the first "/" and the first ",".
    cell_lines_names[1] = cell_lines_names[1].str.split("/").str[0]
    cell_lines_names[1] = cell_lines_names[1].str.split(",").str[0]

    normalized_names = cell_lines_names[1].apply(
        lambda x: x.split("ENCODE")[0].strip()
    ).str.upper().str.replace("-", "")
    cell_lines_codes = pd.concat(
        objs=[
            normalized_names,
            # Codes of the rows that survived both filters.
            df.loc[cell_lines_names.index, 1],
        ],
        axis=1
    )
    cell_lines_codes.columns = ["cell_line", "code"]
    return cell_lines_codes.reset_index(
        drop=True
    ).groupby("cell_line").first().reset_index()
def test_compress_json():
    """Check that every supported extension round-trips a dictionary.

    Both the plain (`dump` / `load`) and the local (`local_dump` /
    `local_load`) APIs are exercised; the directory created by each of
    them is removed afterwards.
    """
    payload = random_string_dict(10, 10)
    expected_digest = sha256(payload)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()

    # (writer, reader, directory to clean up once done)
    # NOTE(review): the local variants presumably resolve paths relative
    # to this test module, hence the cleanup under `tests/`.
    round_trips = (
        (compress_json.dump, compress_json.load, "random_dirs"),
        (compress_json.local_dump, compress_json.local_load, "tests/random_dirs"),
    )
    for writer, reader, created_directory in round_trips:
        for extension in extensions:
            target = f"random_dirs/test.json.{extension}"
            writer(payload, target)
            assert expected_digest == sha256(reader(target))
        shutil.rmtree(created_directory)
def get_vend_data(db_file, vend_array_to_fill, buy_array_to_fill):
    """
    Load a compressed DB file and append its shops to the two given lists.

    Parameters
    ----------
    db_file : filepath
        path of the compressed db file to load.
    vend_array_to_fill : list
        list extended in place with the shops whose type is "V".
    buy_array_to_fill : list
        list extended in place with all the other shops.

    Returns
    -------
    None.

    """
    # Open the compressed database.
    database = compress_json.local_load(db_file)
    # The database may have been stored as a JSON string: decode it.
    if not isinstance(database, dict):
        database = json.loads(database)

    # Only the shops are of interest: copy them over one after the other.
    for shop in database['shops']:
        record = {
            "owner": shop["owner"],
            "location": shop["location"]["map"],
            "creation_date": datetime.datetime.strptime(
                shop["creation_date"], "%Y-%m-%dT%H:%M:%SZ"),
            "items": shop["items"]
        }
        destination = (
            vend_array_to_fill if shop['type'] == 'V' else buy_array_to_fill
        )
        destination.append(record)
def get_available_versions_from_graph_and_repository(name: str, repository: str) -> List[str]:
    """Return list of available versions of the given graph.

    Parameters
    ----------------------
    name: str,
        The name of the graph to retrieve.
    repository: str,
        The name of the repository to retrieve the graph from.

    Returns
    ----------------------
    List with the keys stored under the graph name inside the
    `<repository>.json.gz` metadata file.
    """
    repository_data = compress_json.local_load(
        "{}.json.gz".format(repository),
        use_cache=True
    )
    graph_versions = repository_data[name]
    return list(graph_versions.keys())
def get_chembl_assays(self, start=0, end=100000, step=10000):
    """Get ChEMBL assays by querying the ChEMBL Assay Resource.

    The records are fetched page by page and then stored as JSON in
    `chembl_assay_records.json` inside `self.input_base_dir`.

    Args:
        start: query start
        end: query end
        step: page size

    Returns:
        A list of ChEMBL assay records

    """
    url = 'https://www.ebi.ac.uk/chembl/elk/es/chembl_27_assay/_search'
    query_data = compress_json.local_load('chembl_assay_query.json')
    query_end = self.estimate_records(url, query_data, start, end)

    assays = []
    for i in range(start, query_end, step):
        assays.extend(
            self.get_records(url, query_data, i, min(i + step, query_end))
        )

    # The file is opened only once all records are available, and within
    # a context manager, so that it is always closed and a failed query
    # does not leave an empty file behind.
    output_path = os.path.join(
        self.input_base_dir, 'chembl_assay_records.json')
    with open(output_path, 'w') as output:
        json.dump(assays, output)
    return assays
def apply_replace_defaults(labels: List[str], custom_defaults: Dict[str, List[str]]) -> List[str]:
    """Return list of labels with replaced defaults.

    Parameters
    ----------
    labels: List[str]
        The labels to process.
    custom_defaults: Dict[str, List[str]]
        Mapping from replacement to the regular expressions it replaces.
        These entries override those from `labels.json` sharing the same
        key and are used as they are, while the terms from `labels.json`
        are wrapped so that they do not match when surrounded by
        lowercase letters.

    Returns
    -------
    New list with the processed labels.
    """
    defaults = {}
    for key, values in compress_json.local_load("labels.json").items():
        defaults[key] = [
            "(?<![a-z]){}(?![a-z])".format(val)
            for val in values
        ]
    defaults.update(custom_defaults)

    new_labels = []
    for label in labels:
        # Collect every (matched text, replacement) pair for this label.
        found = []
        for default, patterns in defaults.items():
            for pattern in patterns:
                found.extend(
                    (match, default)
                    for match in re.compile(pattern, re.IGNORECASE).findall(label)
                )

        # The following is required to avoid replacing substrings:
        # going from the shortest match to the longest, a candidate is
        # dropped when its matched text appears within the lowercased
        # replacement of any candidate following it.
        by_increasing_length = sorted(found, key=lambda pair: len(pair[0]))
        kept = []
        for position, (match, default) in enumerate(by_increasing_length):
            following = by_increasing_length[position + 1:]
            if all(
                match not in other_default.lower()
                for _, other_default in following
            ):
                kept.append((match, default))

        # The longest matches are replaced first.
        by_decreasing_length = sorted(
            kept, key=lambda pair: len(pair[0]), reverse=True)
        for match, default in by_decreasing_length:
            label = label.replace(match, default)
        new_labels.append(label)
    return new_labels
def __init__(self, name: str, directed: bool = False, verbose: int = 2, cache_path: str = "graphs"):
    """Create new automatically retrieved graph.

    Parameters
    -------------------
    name: str,
        The name of the graph to be retrieved and loaded.
    directed: bool = False,
        Whether to load the graph as directed or undirected.
        By default false.
    verbose: int = 2,
        Whether to show loading bars.
    cache_path: str = "graphs",
        Where to store the downloaded graphs. A directory named after
        the graph is used within this path.

    Raises
    -------------------
    ValueError,
        If the given graph name is not available.
    """
    available_graphs = compress_json.local_load("graphs.json")
    if name not in available_graphs:
        message = (
            "Requested graph `{}` is not currently available.\n"
            "Open an issue on the EnsmallenGraph repository to ask "
            "for this graph to be added."
        ).format(name)
        raise ValueError(message)

    self._name = name
    self._directed = directed
    self._verbose = verbose
    self._graph = available_graphs[name]
    self._cache_path = os.path.join(cache_path, name)
    self._downloader = BaseDownloader(
        auto_extract=True,
        target_directory=self._cache_path,
        verbose=self._verbose
    )
def roadmap_available_cell_lines(root: str) -> pd.DataFrame:
    """Return Roadmap supported available cell lines.

    Only the rows whose TYPE is not `ESCDerived` and whose GROUP is one
    of `ENCODE2012`, `ESC` and `IMR90` are kept.

    Parameters
    ---------------------------------------
    root: str,
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with columns `cell_line` and `code` describing the
    cell lines supported available in Roadmap dataset.
    """
    info = compress_json.local_load("roadmap.json")
    filename = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], filename, cache=True)

    table = pd.read_csv(filename, sep="\t")
    not_esc_derived = table.TYPE != "ESCDerived"
    in_supported_group = table.GROUP.isin(["ENCODE2012", "ESC", "IMR90"])
    selected = table[not_esc_derived & in_supported_group]

    # The cell line name is the second dot-separated field of the
    # mnemonic, stripped of its dashes.
    mnemonic_fields = selected.MNEMONIC.str.split(".")
    cell_lines_codes = pd.DataFrame({
        "cell_line": mnemonic_fields.str[1].str.replace("-", ""),
        "code": selected.EID,
    })
    return cell_lines_codes.reset_index(drop=True)
def run(self, data_file: Optional[str] = None) -> None:
    """Method is called and performs needed transformations to process
    protein-protein interactions from the STRING DB data.

    The interactions file is read line by line. For each interaction a
    node is written for every protein not seen before (with its UniProt
    cross reference when one is available) and, when the protein maps to
    a gene not seen before, a gene node plus a gene-to-protein edge.
    Finally the protein-protein edge itself is written.

    Args:
        data_file: data file to parse. When not provided, the file
            `9606.protein.links.full.v11.0.txt.gz` within
            `self.input_base_dir` is used.

    Returns:
        None.

    """
    if not data_file:
        data_file = os.path.join(self.input_base_dir,
                                 "9606.protein.links.full.v11.0.txt.gz")
    os.makedirs(self.output_dir, exist_ok=True)
    protein_node_type = "biolink:Protein"
    edge_label = "biolink:interacts_with"
    self.node_header = compress_json.local_load("node_header.json")
    edge_core_header = compress_json.local_load("edge_core_header.json")
    edge_additional_headers = compress_json.local_load(
        "edge_additional_headers.json")

    self.edge_header = edge_core_header + edge_additional_headers
    relation = 'RO:0002434'
    # Identifiers for which a node has already been written.
    seen_proteins: Set = set()
    seen_genes: Set = set()

    # Required to align the node edge header of the gene
    # with the default header
    self.extra_header = [""] * (len(edge_additional_headers) + 1)

    # make string ENSP to Uniprot id mapping dict
    string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
        os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(data_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        # The first line of the interactions file is its header.
        header_items = parse_header(interactions.readline())
        for line in interactions:
            items_dict = parse_stringdb_interactions(line, header_items)
            proteins = []
            for protein_name in ('protein1', 'protein2'):
                nat_string_id = get_item_by_priority(
                    items_dict, [protein_name])
                # Drop the first dot-separated field of the identifier.
                # NOTE(review): presumably the taxon id ("9606.") - confirm.
                protein = '.'.join(nat_string_id.split('.')[1:])
                proteins.append(protein)
                if protein in self.protein_gene_map:
                    gene = self.protein_gene_map[protein]
                    if gene not in seen_genes:
                        seen_genes.add(gene)
                        ensemble_gene = f"ENSEMBL:{gene}"
                        gene_informations = self.gene_info_map[
                            self.ensembl2ncbi_map[gene]]
                        # Gene node.
                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                ensemble_gene,
                                gene_informations['symbol'],
                                'biolink:Gene',
                                gene_informations['description'],
                                f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
                                self.source_name
                            ])
                        # Edge from the gene to its protein product; the
                        # trailing columns are left empty.
                        write_node_edge_item(
                            fh=edge,
                            header=self.edge_header,
                            data=[
                                ensemble_gene,
                                "biolink:has_gene_product",
                                f"ENSEMBL:{protein}",
                                "RO:0002205",
                                "NCBI",
                                ""
                            ] + self.extra_header)

                # write node data
                if protein not in seen_proteins:
                    seen_proteins.add(protein)

                    # if we have an equivalent Uniprot ID for this Ensembl protein
                    # ID make an xref edge, and a node for the Uniprot ID
                    uniprot_curie = ''
                    if protein in string_to_uniprot_id_map:
                        uniprot_curie = \
                            f"UniProtKB:{string_to_uniprot_id_map[protein]}"
                        uniprot_curie = collapse_uniprot_curie(
                            uniprot_curie)

                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            f"ENSEMBL:{protein}",
                            "",
                            protein_node_type,
                            "",
                            uniprot_curie,  # xref
                            self.source_name
                        ])

            # write edge data
            write_node_edge_item(
                fh=edge,
                header=self.edge_header,
                data=[
                    f"ENSEMBL:{proteins[0]}",
                    edge_label,
                    f"ENSEMBL:{proteins[1]}",
                    relation,
                    "STRING",
                    "biolink:Association",
                    items_dict['combined_score']
                ] + [
                    items_dict.get(header, "")
                    for header in edge_additional_headers
                ])
def __init__(self):
    """Create new FreeBase Graph Repository object.

    The repository metadata is read from the bundled `freebase.json`.
    """
    super().__init__()
    freebase_metadata = compress_json.local_load("freebase.json")
    self._data = freebase_metadata
def train_sequence(epigenomes, labels, genome, cell_line, region, models):
    """Fit every model on sequence data over stratified holdouts.

    Results are checkpointed to `<cell_line>/sequence_<region>.json`
    after each trained model; combinations of model and holdout that
    `precomputed` reports as already present are skipped. The scores
    stored are those of the last row of the training history.

    Parameters
    ----------
    epigenomes:
        Mapping from region to a dataframe whose index levels describe
        the regions to train on.
    labels:
        Mapping from region to the labels.
    genome:
        Genome object forwarded to `get_holdout`.
    cell_line:
        Cell line name, used as directory of the results file.
    region:
        Key of the region to train on.
    models:
        Models exposing `name` and a `fit` returning an object with a
        `history` attribute.

    Returns
    -------
    DataFrame with one row per (model, run type, holdout), without the
    `holdout` column.
    """
    region_frame = epigenomes[region]
    # Keep only the columns that were index levels.
    bed = region_frame.reset_index()[region_frame.index.names]
    n_splits = 2
    splitter = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=0.2, random_state=42)
    results_path = cell_line + "/sequence_" + region + ".json"

    results = []
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)

    holdout_iterator = tqdm(
        enumerate(splitter.split(bed, labels[region])),
        total=n_splits,
        desc="Computing holdouts",
        dynamic_ncols=True
    )
    for holdout, (train_index, test_index) in holdout_iterator:
        train, test = get_holdout(
            train_index, test_index, bed, labels[region], genome)
        for model in tqdm(models, total=len(models), desc="Training models",
                          leave=False, dynamic_ncols=True):
            if precomputed(results, model.name, holdout):
                continue
            early_stopping = EarlyStopping(
                monitor="val_loss", mode="min", patience=50)
            fit_output = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=100,
                shuffle=True,
                verbose=False,
                callbacks=[
                    early_stopping,
                ]
            )
            # Scores of the last recorded epoch.
            scores = pd.DataFrame(fit_output.history).iloc[-1].to_dict()
            train_scores = {
                key: value
                for key, value in scores.items()
                if not key.startswith("val_")
            }
            # Validation scores are stored without the "val_" prefix.
            test_scores = {
                key[4:]: value
                for key, value in scores.items()
                if key.startswith("val_")
            }
            for run_type, run_scores in (("train", train_scores),
                                         ("test", test_scores)):
                results.append({
                    "model": model.name,
                    "run_type": run_type,
                    "holdout": holdout,
                    **run_scores
                })
            # Checkpoint so that an interrupted run can be resumed.
            compress_json.local_dump(results, results_path)

    return pd.DataFrame(results).drop(columns=["holdout"])
def get_df(cell_line, data_type, region_type):
    """Return the test rows of the stored results.

    The results are read from
    `<cell_line>_<region_type>_<data_type>.json`, the `holdout` column is
    dropped and only the rows whose `run_type` is `test` are returned.
    """
    results_path = cell_line + "_" + region_type + "_" + data_type + ".json"
    frame = pd.DataFrame(compress_json.local_load(results_path))
    frame = frame.drop(columns="holdout")
    is_test_run = frame.run_type == "test"
    return frame[is_test_run]
def train_model_epi(models, epigenomes, nlabels, region_type, cell_line):
    """Fit every model on epigenomic data over stratified holdouts.

    Balanced class weights are computed from the labels and given to
    each `fit` call. Results are checkpointed to
    `<cell_line>_<region_type>_epigenomic.json` after each trained model;
    combinations of model and holdout that `precomputed` reports as
    already present are skipped.

    Parameters
    ----------
    models:
        Models exposing `fit` and `predict`.
    epigenomes:
        Mapping from region type to the features.
        NOTE(review): the features are indexed with position arrays
        (`X[train]`), so they are presumably a numpy array - confirm.
    nlabels:
        Mapping from region type to an object exposing `.values` with
        the labels.
    region_type:
        Key of the region to train on.
    cell_line:
        Cell line name, used as prefix of the results file.

    Returns
    -------
    DataFrame with one row per (model, run type, holdout), without the
    `holdout` column.
    """
    # Reproducibility.
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so
    # setting it here presumably only affects child processes.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    y = nlabels[region_type].values.ravel()
    X = epigenomes[region_type]
    print("Num feature: " + str(X.shape[1]))

    splits = 51
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)

    # `classes` and `y` are keyword-only in recent scikit-learn releases:
    # passing them by name works on both old and new versions.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(y), y=y)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    results_path = cell_line + "_" + region_type + "_epigenomic.json"
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)
    else:
        results = []

    for i, (train, test) in tqdm(enumerate(holdouts.split(X, y)),
                                 total=splits, desc="Computing holdouts",
                                 dynamic_ncols=True):
        for model in tqdm(models, total=len(models), desc="Training models",
                          leave=False, dynamic_ncols=True):
            # Keras Sequential models are told apart by their own name.
            model_name = (
                model.__class__.__name__
                if model.__class__.__name__ != "Sequential"
                else model.name
            )
            if precomputed(results, model_name, i):
                continue
            model.fit(
                X[train], y[train],
                epochs=1000,
                shuffle=True,
                verbose=False,
                validation_split=0.1,
                batch_size=1024,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min",
                                  patience=50, restore_best_weights=True),
                ]
            )
            results.append({
                "model": model_name,
                "run_type": "train",
                "holdout": i,
                **report(y[train], model.predict(X[train]))
            })
            results.append({
                "model": model_name,
                "run_type": "test",
                "holdout": i,
                **report(y[test], model.predict(X[test]))
            })
            # Checkpoint so that an interrupted run can be resumed.
            compress_json.local_dump(results, results_path)

    df = pd.DataFrame(results)
    df = df.drop(columns=["holdout"])
    return df
def __init__(
    self,
    name: str,
    version: str,
    repository: str,
    directed: bool = False,
    preprocess: Union[bool, str] = "auto",
    load_nodes: bool = True,
    load_node_types: bool = True,
    load_edge_weights: bool = True,
    auto_enable_tradeoffs: bool = True,
    sort_tmp_dir: Optional[str] = None,
    verbose: int = 2,
    cache: bool = True,
    cache_path: Optional[str] = None,
    cache_sys_var: str = "GRAPH_CACHE_DIR",
    graph_kwargs: Dict = None,
    hash_seed: str = None,
    callbacks: List[Callable] = (),
    callbacks_arguments: List[Dict] = (),
):
    """Create new automatically retrieved graph.

    Parameters
    -------------------
    name: str
        The name of the graph to be retrieved and loaded.
    version: str
        The version of the graph to be retrieved.
    repository: str
        Name of the repository to load data from.
    directed: bool = False
        Whether to load the graph as directed or undirected.
        By default false.
    preprocess: Union[bool, str] = "auto"
        Whether to preprocess the node list and edge list
        to be loaded optimally in both time and memory.
        With "auto", the preprocessing is enabled on Linux and macOS
        and disabled otherwise.
    load_nodes: bool = True
        Whether to load the nodes vocabulary or treat the nodes
        simply as a numeric range.
        This feature is only available when the preprocessing is enabled.
    load_node_types: bool = True
        Whether to load the node types if available or skip them entirely.
        This feature is only available when the preprocessing is enabled.
    load_edge_weights: bool = True
        Whether to load the edge weights if available or skip them entirely.
        This feature is only available when the preprocessing is enabled.
    auto_enable_tradeoffs: bool = True
        Whether to enable the Ensmallen time-memory tradeoffs in small
        graphs automatically. By default True, that is, if a graph has
        less than 50 million edges. In such use cases the memory
        expenditure is minimal.
    sort_tmp_dir: Optional[str] = None
        Which folder to use to store the temporary files needed to sort
        in parallel the edge list when building the optimal
        preprocessed file. This defaults to the same folder of the edge
        list when no value is provided.
    verbose: int = 2
        Whether to show loading bars.
    cache: bool = True
        Whether to use cache, i.e. download files only once
        and preprocess them only once.
    cache_path: Optional[str] = None
        Where to store the downloaded graphs.
        If no path is provided, the system variable named below is
        used when set, otherwise the directory `graphs`.
        The repository, graph name and version are appended to it.
    cache_sys_var: str = "GRAPH_CACHE_DIR"
        The system variable with the default graph cache directory.
    graph_kwargs: Dict = None
        Eventual additional kwargs for loading the graph.
    hash_seed: str = None
        Seed to use for the hash.
    callbacks: List[Callable] = ()
        Eventual callbacks to call after download files.
    callbacks_arguments: List[Dict] = ()
        Eventual arguments for callbacks.

    Raises
    -------------------
    ValueError,
        If the given graph name is not available.
    ValueError,
        If the preprocess flag is provided but the system
        is Windows, which does not provide the sort command.
    """
    try:
        validate_graph_version(name, repository, version)
        repository_metadata = compress_json.local_load(
            "{}.json.gz".format(repository)
        )
        self._graph = repository_metadata[name][version]
    except KeyError:
        raise ValueError(
            (
                "Requested graph `{}` is not currently available.\n"
                "Open an issue on the Graph repository to ask "
                "for this graph to be added."
            ).format(name)
        )

    if preprocess == "auto":
        preprocess = is_macos() or is_linux()

    if preprocess and is_windows():
        raise ValueError(
            "Currently preprocessing to optimal edge list is not supported "
            "on Windows because the sorting step is based upon the `sort` "
            "command, which is only available to our knowledge on Linux and "
            "macOS systems."
        )

    # Without an explicit cache path we fall back on the system
    # variable and, when that is not set either, on `graphs`.
    if cache_path is None:
        cache_path = os.getenv(cache_sys_var, "graphs")

    if graph_kwargs is None:
        graph_kwargs = {}

    # Identity of the requested graph.
    self._name = name
    self._version = version
    self._repository = repository

    # Loading options.
    self._directed = directed
    self._preprocess = preprocess
    self._load_nodes = load_nodes
    self._load_node_types = load_node_types
    self._load_edge_weights = load_edge_weights
    self._auto_enable_tradeoffs = auto_enable_tradeoffs
    self._sort_tmp_dir = sort_tmp_dir
    self._graph_kwargs = graph_kwargs

    # Caching, reporting and callbacks.
    self._cache = cache
    self._verbose = verbose
    self._callbacks = callbacks
    self._callbacks_arguments = callbacks_arguments

    self._instance_hash = sha256({
        "hash_seed": hash_seed,
        **self._graph,
        **self._graph_kwargs,
    })
    self._cache_path = os.path.join(
        cache_path,
        repository,
        name,
        version
    )
    self._downloader = BaseDownloader(
        auto_extract=True,
        cache=cache,
        target_directory=self._cache_path,
        verbose=self._verbose,
        process_number=1
    )
def train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep):
    """Train the actor and the critic on the given environment.

    Behaviour is also driven by command line flags read from `sys.argv`:
    `--save NAME` stores the weights in `savedir/NAME/save` at the end,
    `--load NAME` restores them from there instead of initialising the
    variables, `--loadBuff NAME` pre-fills the replay buffer from
    `preTrain/NAME.json.gz` and `--visu` renders the environment.

    One episode out of ten (when `i % 10 == 0`) is run without
    exploration noise.

    Args:
        sess: TensorFlow (v1 compat) session holding the networks.
        env: environment whose `reset` and `step` return observations as
            mappings with the keys "observation" and "desired_goal".
        actor: actor network, exposing `s_dim`, `a_dim`, `predict`,
            `predict_target`, `train` and `update_target_network`.
        critic: critic network, exposing `gamma`, `predict_target`,
            `train`, `action_gradients` and `update_target_network`.
        actor_noise: callable exploration noise with a `reset` method.
        buffer_size: capacity of the replay buffer.
        min_batch: size of the sampled batches; no update happens until
            the buffer holds at least this many transitions.
        ep: number of episodes to run.

    Returns:
        List with the total reward of each episode.
    """
    if "--save" in sys.argv:
        saver = tf.compat.v1.train.Saver()

    if "--load" in sys.argv:
        print("loading weights")
        loader = tf.compat.v1.train.Saver()
        arg_index = sys.argv.index("--load")
        save_name = sys.argv[arg_index + 1]
        loader.restore(sess, "savedir/" + save_name + "/save")
        print("weights loaded")
    else:
        sess.run(tf.compat.v1.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(buffer_size, 0)
    if "--loadBuff" in sys.argv:
        arg_index = sys.argv.index("--loadBuff")
        buffPath = sys.argv[arg_index + 1]
        print("loading buffer")
        tempBuff = compress_json.local_load(
            "preTrain/" + buffPath + ".json.gz")
        # The stored transitions are repeated as many whole times as
        # they fit in the buffer.
        nb = buffer_size / len(tempBuff["action"])
        for _ in range(int(nb)):
            for s, a, r, d, s1 in zip(tempBuff["state"], tempBuff["action"],
                                      tempBuff["reward"], tempBuff["done"],
                                      tempBuff["next_state"]):
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, d,
                                  np.reshape(s1, (actor.s_dim, )))
        print("buffer loaded")

    max_episodes = ep
    max_steps = 200
    render = '--visu' in sys.argv
    score_list = []
    tcostlist = []
    tic = time.time()
    # Defined upfront so that the final report works with zero episodes.
    tac = tic

    for i in range(max_episodes):
        state = env.reset()
        state = np.concatenate([state["observation"], state["desired_goal"]])
        score = 0
        costs = []
        actor_noise.reset()
        # One episode out of ten is played without exploration noise.
        explo = 0 if i % 10 == 0 else 1

        for j in range(max_steps):
            if render:
                env.render()

            action = np.clip(
                actor.predict(np.reshape(state, (1, actor.s_dim))) +
                actor_noise() * explo, -1, 1)
            next_state, reward, done, _ = env.step(action.reshape(4, ))
            next_state = np.concatenate(
                [next_state["observation"], next_state["desired_goal"]])
            replay_buffer.add(np.reshape(state, (actor.s_dim, )),
                              np.reshape(action, (actor.a_dim, )), reward,
                              done, np.reshape(next_state, (actor.s_dim, )))

            # Updating the network in batch, once enough transitions are
            # available. The update is guarded by an `if` rather than
            # skipped with `continue`, so that the bookkeeping below
            # (state, score, termination) runs during warm-up too.
            if replay_buffer.size() >= min_batch:
                states, actions, rewards, dones, next_states = \
                    replay_buffer.sample_batch(min_batch)
                target_q = critic.predict_target(
                    next_states, actor.predict_target(next_states))
                # Bootstrapped targets: terminal transitions only keep
                # their reward.
                y = [
                    rewards[k] + critic.gamma * target_q[k] * (1 - dones[k])
                    for k in range(min_batch)
                ]

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    states, actions, np.reshape(y, (min_batch, 1)))
                costs.append(y - predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(states)
                grads = critic.action_gradients(states, a_outs)
                actor.train(states, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            state = next_state
            score += reward

            tac = time.time()
            print("\033[0;1;4;97m", end='')
            print("Episode:", end="")
            print("\033[0;97m", end='')
            print(" {} ".format(i), end='')
            print("\033[3;4;91m", end='')
            print("temps total : {} secondes\r".format(int(tac - tic)),
                  end='')

            if done:
                break

        # No update may have happened yet: avoid averaging an empty list.
        tcost = np.mean(costs) if costs else float("nan")
        tcostlist.append(tcost)
        score_list.append(score)

        if i % 10 == 0:
            print("\033[0;1;4;97m", end='')
            print("Episode:", end="")
            print("\033[0;97m", end='')
            print(" {} ".format(i))
            print("total reward: {:.5} avg reward (last 10): {:.5}".format(
                score, np.mean(score_list[max(0, i - 10):(i + 1)])))
            print("cost: {:.5} avg cost (last 10): {:.5}".format(
                tcost, np.mean(tcostlist[max(0, i - 10):(i + 1)])))

    if "--save" in sys.argv:
        arg_index = sys.argv.index("--save")
        save_name = sys.argv[arg_index + 1]
        saver.save(sess, "savedir/" + save_name + "/save")

    print("\033[3;4;91m", end='')
    print("temps total : {} secondes".format(int(tac - tic)))
    return score_list
def roadmap(cell_lines: Union[List[str], str],
            window_size: int,
            genome: str = "hg19",
            root: str = "roadmap",
            states: int = 18,
            enhancers_labels: List[str] = ("7_Enh", "9_EnhA1", "10_EnhA2"),
            promoters_labels: List[str] = ("1_TssA", ),
            nrows: int = None):
    """Runs the pipeline over the roadmap raw data.

    Parameters
    -----------------------------
    cell_lines: List[str],
        List of cell lines to be considered.
    window_size: int,
        Window size to use for the various regions.
    genome: str = "hg19",
        Considered genome version. Currently supported only "hg19".
    root: str = "roadmap",
        Directory forwarded to the helpers retrieving the cell lines
        and their data.
    states: int = 18,
        Number of the states of the model to consider.
        Currently supported only "15" and "18".
    enhancers_labels: List[str] = ("7_Enh", "9_EnhA1", "10_EnhA2"),
        Labels to encode as active enhancers.
    promoters_labels: List[str] = ("1_TssA",),
        Labels to encode as active promoters.
    nrows: int = None,
        The number of rows to read, useful when testing pipelines
        for creating smaller datasets.

    Raises
    -------------------------------
    ValueError:
        If given cell lines list is empty.
    ValueError:
        If given cell lines are not strings.
    ValueError:
        If given window size is not an integer.
    ValueError:
        If given window size is not a strictly positive integer.
    ValueError:
        If given genome version is not a string.
    ValueError:
        If given nrows parameter is not None or a strictly positive integer.
    ValueError:
        If the model with *states* states is not currently supported
        with given genome *genome*.

    Returns
    -------------------------------
    Tuple containing the dataframes with the information for enhancers
    and promoters for chosen cell lines.
    """
    info = compress_json.local_load("roadmap.json")
    validate_common_parameters(cell_lines, [window_size], genome, info)
    cell_lines = normalize_cell_lines(cell_lines)

    supported_models = info[genome]["states_model"]
    if str(states) not in supported_models:
        raise ValueError(
            "The model with {states} states is not currently supported with given genome {genome}."
            .format(states=states, genome=genome))

    cell_lines_names = filter_cell_lines(
        root,
        cell_lines,
    )
    url = info[genome]["states_model"][str(states)]

    # Retrieve the regions of each cell line, skipping the cell lines
    # for which either enhancers or promoters are not available.
    retrieved = []
    for cell_line, code in tqdm(cell_lines_names.values, desc="Cell lines"):
        cell_line_enhancers, cell_line_promoters = get_cell_line(
            root, cell_line, states, genome, enhancers_labels,
            promoters_labels, url.format(code=code), nrows)
        if cell_line_enhancers is None or cell_line_promoters is None:
            continue
        retrieved.append((cell_line_enhancers, cell_line_promoters))
    enhancers_list, promoters_list = zip(*retrieved)

    # Inactive regions are encoded as zeros; the index is then turned
    # into columns so that the windows can be adapted to the given size.
    processed = []
    for frames in (enhancers_list, promoters_list):
        regions = pd.concat(frames, axis=1).fillna(0).astype(int)
        regions = center_window(regions.reset_index(), window_size)
        processed.append(normalize_bed_file(cell_lines, regions))
    enhancers, promoters = processed
    return enhancers, promoters
def run(self, data_file: str = None) -> None:
    """Method is called and performs needed transformations to process
    protein-protein interactions from the STRING DB data.

    The interactions file is read line by line. For each interaction a
    node is written for every protein not seen before and, when the
    protein maps to a gene not seen before, a gene node plus a
    gene-to-protein edge. Finally the protein-protein edge is written.
    Proteins are always referred to as `ENSEMBL:<id>`, in both the node
    and the edge file.

    Args:
        data_file: data file to parse. When not provided, the file
            `9606.protein.links.full.v11.0.txt.gz` within
            `self.input_base_dir` is used.

    Returns:
        None.

    """
    if not data_file:
        data_file = os.path.join(self.input_base_dir,
                                 "9606.protein.links.full.v11.0.txt.gz")
    os.makedirs(self.output_dir, exist_ok=True)
    protein_node_type = "biolink:Protein"
    edge_label = "biolink:interacts_with"
    self.node_header = compress_json.local_load("node_header.json")
    edge_core_header = compress_json.local_load("edge_core_header.json")
    edge_additional_headers = compress_json.local_load(
        "edge_additional_headers.json")

    self.edge_header = edge_core_header + edge_additional_headers
    relation = 'RO:0002434'
    # Identifiers for which a node has already been written.
    seen_proteins: Set = set()
    seen_genes: Set = set()

    # Required to align the node edge header of the gene
    # with the default header
    extra_header = [""] * (len(edge_additional_headers) + 1)

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(data_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        # The first line of the interactions file is its header.
        header_items = parse_header(interactions.readline())
        for line in interactions:
            items_dict = parse_stringdb_interactions(line, header_items)
            proteins = []
            for protein_name in ('protein1', 'protein2'):
                protein = get_item_by_priority(items_dict, [protein_name])
                # Drop the first dot-separated field of the identifier.
                # NOTE(review): presumably the taxon id ("9606.") - confirm.
                protein = '.'.join(protein.split('.')[1:])
                proteins.append(protein)
                if protein in self.protein_gene_map:
                    gene = self.protein_gene_map[protein]
                    if gene not in seen_genes:
                        seen_genes.add(gene)
                        ensemble_gene = f"ENSEMBL:{gene}"
                        gene_informations = self.gene_info_map[
                            self.ensembl2ncbi_map[gene]]
                        # Gene node.
                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                ensemble_gene,
                                gene_informations['symbol'],
                                'biolink:Gene',
                                gene_informations['description'],
                                f"NCBIGene:{self.ensembl2ncbi_map[gene]}"
                            ])
                        # Edge from the gene to its protein product.
                        # The protein is prefixed exactly as in its
                        # node, otherwise the edge would point to an
                        # identifier absent from the node file.
                        write_node_edge_item(
                            fh=edge,
                            header=self.edge_header,
                            data=[
                                ensemble_gene,
                                "biolink:has_gene_product",
                                f"ENSEMBL:{protein}",
                                "RO:0002205",
                                "NCBI",
                            ] + extra_header)

                # write node data
                if protein not in seen_proteins:
                    seen_proteins.add(protein)
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            f"ENSEMBL:{protein}",
                            "",
                            protein_node_type,
                            "",
                            ""
                        ])

            # write edge data, using the same prefixed identifiers
            # the protein nodes were written with
            write_node_edge_item(
                fh=edge,
                header=self.edge_header,
                data=[
                    f"ENSEMBL:{proteins[0]}",
                    edge_label,
                    f"ENSEMBL:{proteins[1]}",
                    relation,
                    "STRING",
                    items_dict['combined_score']
                ] + [
                    items_dict.get(header, "")
                    for header in edge_additional_headers
                ])
def sanitize_ml_labels(
    labels: Union[List[str], str],
    upper_case_consonants_clusters: bool = True,
    replace_with_spaces: List[str] = ("-", "_", ":", "<", ">"),
    detect_and_remove_homogeneous_descriptors: bool = True,
    replace_defaults: bool = True,
    soft_capitalization: bool = True,
    custom_defaults: Dict[str, Union[List[str], str]] = None
) -> Union[List[str], str]:
    """Return sanitized labels in standard way.

    Parameters
    ----------
    labels: Union[List[str], str]
        Either label or list of labels to sanitize.
        A string, or any non-iterable object, is handled as a single label.
    upper_case_consonants_clusters: bool = True
        Whether to convert to upper case detected initials.
    replace_with_spaces: List[str] = ("-", "_", ":", "<", ">")
        Characters to be replaced with spaces.
    detect_and_remove_homogeneous_descriptors: bool = True
        Whether to remove the known descriptors when all terms contain it.
    replace_defaults: bool = True
        Whether to replace default terms.
    soft_capitalization: bool = True
        Whether to apply soft capitalization,
        replacing capitalization only when no capitalization is already present.
    custom_defaults: Dict[str, Union[List[str], str]] = None
        Additional defaults forwarded to `apply_replace_defaults`.
        Values that are not lists are wrapped into a single-element list.
        Only used when `replace_defaults` is enabled.

    Returns
    -------
    Sanitized labels: a single sanitized label when a single label was
    provided, a list of sanitized labels otherwise.
    """
    try:
        iter(labels)
        is_iterable = True
    except TypeError:
        is_iterable = False

    # A string is iterable, but it still represents one single label.
    single_label = not is_iterable or isinstance(labels, str)
    if single_label:
        labels = [labels]

    labels = to_string(labels)

    if detect_and_remove_homogeneous_descriptors:
        generic_words_cooccurring_with_descriptors = compress_json.local_load(
            "generic_words_cooccurring_with_descriptors.json")
        for descriptor in compress_json.local_load("descriptors.json"):
            if have_descriptor(
                labels,
                descriptor,
                generic_words_cooccurring_with_descriptors
            ):
                labels = remove_descriptor(labels, descriptor)

    if soft_capitalization:
        labels = apply_soft_capitalization(labels)

    if replace_defaults:
        if custom_defaults is None:
            custom_defaults = dict()

        # Normalize every custom default to a list of values.
        custom_defaults = {
            key: value if isinstance(value, list) else [value]
            for key, value in custom_defaults.items()
        }

        labels = apply_replace_defaults(labels, custom_defaults)

    labels = [
        targets_to_spaces(label, replace_with_spaces)
        for label in labels
    ]

    labels = clear_spaces(labels)

    # Soft capitalization is applied a second time, after the replacements
    # and the spaces normalization above have changed the labels.
    if soft_capitalization:
        labels = apply_soft_capitalization(labels)

    if upper_case_consonants_clusters:
        labels = [consonants_to_upper(label) for label in labels]

    if single_label:
        return labels[0]

    return labels
def __init__(self):
    """Initialize the JAX Graph Repository.

    After the parent initialization, the content of ``jax.json`` is
    loaded through compress_json and stored in ``self._data``.
    """
    super().__init__()
    repository_data = compress_json.local_load("jax.json")
    self._data = repository_data
def __init__(self):
    """Initialize the graph repository backed by ``linqs.json``.

    After the parent initialization, the content of ``linqs.json``
    (presumably the LINQS repository metadata) is loaded through
    compress_json and stored in ``self._data``.
    """
    super().__init__()
    repository_data = compress_json.local_load("linqs.json")
    self._data = repository_data
def _load_unsupported_graphs(self) -> Set[str]:
    """Return the known unsupported graphs.

    The collection is loaded from ``self.unsupported_graphs_path``.
    When loading fails for any reason, an empty list is returned.

    NOTE(review): the fallback value is a list even though the
    annotation advertises a set - confirm what callers expect.
    """
    try:
        unsupported_graphs = compress_json.local_load(
            self.unsupported_graphs_path
        )
    except Exception:
        # Best effort: any failure means that nothing is available.
        unsupported_graphs = []
    return unsupported_graphs
def main():
    """Pre-train the actor and critic networks from a stored buffer.

    Options read from ``sys.argv``:

    * ``--mstep N``: micro-stepping factor multiplying the state size
      (defaults to 1).
    * ``--ep N``: number of mini-batch updates to run (defaults to 10000).
    * ``--loadBuff NAME``: mandatory, the buffer is loaded from
      ``preTrain/NAME.json.gz``.
    * ``--save NAME``: mandatory, the session is saved to
      ``savedir/NAME/save``.

    A missing mandatory option raises the ``ValueError`` of ``list.index``.

    Returns:
        0, once the session has been saved.
    """

    def int_option(flag, fallback):
        """Return the integer following `flag` in sys.argv, or `fallback`."""
        if flag in sys.argv:
            return int(sys.argv[sys.argv.index(flag) + 1])
        return fallback

    with tf.compat.v1.Session() as sess:
        started_at = time.time()
        env = customEnv()

        micro_stepping = int_option("--mstep", 1)
        ep = int_option("--ep", 10000)

        # Hyper-parameters
        tau = 0.001
        gamma = 0.99
        min_batch = 64
        actor_lr = 0.0001
        critic_lr = 0.001
        buffer_size = 1000000
        layers = [300]

        observation_size = env.observation_space["observation"].shape[0]
        goal_size = env.observation_space["desired_goal"].shape[0]
        state_dim = (observation_size + goal_size)*micro_stepping
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        actor = ActorNetwork(
            sess, state_dim, action_dim, action_bound,
            layers, actor_lr, tau, min_batch
        )
        critic = CriticNetwork(
            sess, state_dim, action_dim, layers,
            critic_lr, tau, gamma, actor.get_num_trainable_vars()
        )

        # Both networks are fitted with a mean squared error against the
        # actions and the rewards sampled from the buffer.
        action_wanted = tf.compat.v1.placeholder(
            tf.float32, (None, action_dim))
        reward_wanted = tf.compat.v1.placeholder(tf.float32, (None, 1))
        actor_target = tf.reduce_mean(tf.square(actor.out-action_wanted))
        critic_target = tf.reduce_mean(tf.square(critic.out-reward_wanted))
        actor_train = tf.compat.v1.train.AdamOptimizer(
            actor_lr).minimize(actor_target)
        critic_train = tf.compat.v1.train.AdamOptimizer(
            critic_lr).minimize(critic_target)

        # Operations copying the trained parameters into the target networks.
        update_target_network_actor = [
            target_param.assign(actor.network_params[index])
            for index, target_param in enumerate(actor.target_network_params)
        ]
        update_target_network_critic = [
            target_param.assign(critic.network_params[index])
            for index, target_param in enumerate(critic.target_network_params)
        ]

        print("\033[0;1;32m")
        print("===================")
        print("LE DEBUT")
        print("===================")

        print("loading buffer")
        buffPath = sys.argv[sys.argv.index("--loadBuff") + 1]
        buffer = compress_json.local_load("preTrain/"+buffPath+".json.gz")
        print("buffer loaded")

        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()

        for step in range(1, ep + 1):
            states, actions, rewards = sample(buffer, min_batch)
            sess.run(
                actor_train,
                {actor.inputs: states, action_wanted: actions}
            )
            sess.run(
                critic_train,
                {
                    critic.inputs: states,
                    critic.action: actions,
                    reward_wanted: np.reshape(rewards, (min_batch, 1))
                }
            )

            # Progress line, rewritten in place through the carriage return.
            print("\033[0;1;4;97m", end='')
            print("miniBatch {} / {}".format(step, ep), end='')
            print("\033[0;m ", end='')
            now = time.time()
            print("\033[3;91m", end='')
            print("{} secondes".format(int(now - started_at)), end='')
            print("\033[0;m \r", end='')

        # NOTE(review): the target networks are synchronised once the loop
        # is over - confirm this placement against the original layout.
        sess.run(update_target_network_actor)
        sess.run(update_target_network_critic)

        save_name = sys.argv[sys.argv.index("--save") + 1]
        saver.save(sess, "savedir/" + save_name+"/save")
        print("\033[0;1;32m")
        print("session saved at : " + save_name)
    return 0
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    """Train the sequence models over stratified holdouts and collect scores.

    Results are dumped after every trained model, and the models for which
    `precomputed` reports an existing entry are skipped, so that an
    interrupted run can be resumed from the stored results.

    Parameters
    ----------
    models:
        Sized collection of models exposing a `name` attribute and a
        Keras-style `fit` method returning an object with a `history`.
    epigenomes:
        Mapping from region type to the data handed to `to_bed`.
    nlabels:
        Mapping from region type to the labels; the selected entry must
        expose `.values` (e.g. a pandas DataFrame).
    region_type:
        Key selecting the region in `epigenomes` and `nlabels`,
        also used to build the results file name.
    cell_line:
        Cell line name, used as prefix of the results file name.

    Returns
    -------
    pandas DataFrame with one row per model, run type and holdout,
    without the `holdout` column.
    """
    # Reproducibility
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()

    # NOTE(review): `os.path.exists` resolves this path against the current
    # working directory, while `local_load` presumably resolves it against
    # this module's directory - confirm that the two always coincide.
    results_path = cell_line + "_" + region_type + "_sequence.json"
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)
    else:
        results = []

    # `classes` and `y` are passed by keyword: recent scikit-learn releases
    # reject them as positional arguments, older ones accept both forms.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    # Assumes the sorted unique labels are 0..k-1 - TODO confirm.
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(
        enumerate(holdouts.split(bed, labels)),
        total=splits,
        desc="Computing holdouts",
        dynamic_ncols=True
    ):
        # The trailing 1024 is presumably the batch size - TODO confirm.
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(
            models,
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True
        ):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(
                        monitor="val_loss",
                        mode="min",
                        patience=50,
                        restore_best_weights=True
                    ),
                ]
            ).history
            # Only the metrics of the last recorded epoch are kept.
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            # Validation metrics are stored under the name of the training
            # metric, hence the `val_` prefix is stripped.
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            # Checkpoint the results after every trained model.
            compress_json.local_dump(results, results_path)

    df = pd.DataFrame(results).drop(columns="holdout")
    return df