def build_object(cls, **kwargs):
    """Validate the keyword arguments and construct a Coefficients report instance."""
    location = "Coefficients"
    requested = [coef.upper() for coef in kwargs["coefs_to_plot"]]
    report_name = kwargs.get("name")

    valid_settings = [item.name.upper() for item in CoefficientPlottingSetting]
    ParameterValidator.assert_all_in_valid_list(requested, valid_settings, location, "coefs_to_plot")

    # cutoff / n_largest are only required when their plotting setting was requested
    cutoff = []
    if CoefficientPlottingSetting.CUTOFF.name in requested:
        cutoff = kwargs["cutoff"]
        ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
        ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

    n_largest = []
    if CoefficientPlottingSetting.N_LARGEST.name in requested:
        n_largest = kwargs["n_largest"]
        ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
        ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

    coefs = CoefficientPlottingSettingList()
    for keyword in requested:
        coefs.append(CoefficientPlottingSetting[keyword.upper()])

    return Coefficients(coefs, cutoff, n_largest, report_name)
def parse_object(specs, valid_class_names: list, class_name_ending: str, class_path: str, location: str, key: str,
                 builder: bool = False, return_params_dict: bool = False):
    """Resolve the class named in specs, validate its parameters, and instantiate it.

    Arguments:
        specs: specification (string or dict) naming the class and its parameters
        valid_class_names: basic class names that are acceptable here
        class_name_ending: suffix appended to the basic name to get the real class name
        class_path: path used by ReflectionHandler to locate the class
        location: parser name used in error messages
        key: key under which the object was specified
        builder: when True and the class has build_object, use it instead of the constructor
        return_params_dict: when True, return (object, {class_name: params}) instead of just the object

    Raises:
        AssertionError: when the class rejects the given parameters (TypeError from the call).
    """
    class_name = ObjectParser.get_class_name(specs, valid_class_names, class_name_ending, location, key)
    ParameterValidator.assert_in_valid_list(class_name, valid_class_names, location, key)

    cls = ReflectionHandler.get_class_by_name(f"{class_name}{class_name_ending}", class_path)
    params = ObjectParser.get_all_params(specs, class_path, class_name, key)

    try:
        if "name" not in inspect.signature(cls.__init__).parameters.keys():
            # fix: original used `del params["name"]`, which raised KeyError when "name" was absent
            params.pop("name", None)
        obj = cls.build_object(**params) if builder and hasattr(cls, "build_object") else cls(**params)
    except TypeError as err:
        raise AssertionError(f"{location}: invalid parameter {err.args[0]} when specifying parameters in {specs} "
                             f"under key {key}. Valid parameter names are: "
                             f"{[name for name in inspect.signature(cls.__init__).parameters.keys()]}") from err

    return (obj, {class_name: params}) if return_params_dict else obj
def _prepare_specs(self):
    """Load the YAML specification, validate its top-level structure, and normalize result paths."""
    with open(self.yaml_path, "r") as file:
        specs = yaml.safe_load(file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], GalaxyTrainMLModel.__name__, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], GalaxyTrainMLModel.__name__,
                                                "YAML specification")
    ParameterValidator.assert_type_and_value(specs["instructions"], dict, GalaxyTrainMLModel.__name__, "instructions")

    instruction_keys = list(specs["instructions"].keys())
    assert len(instruction_keys) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                       f"`instructions`, got the following instead: {instruction_keys}."

    self.instruction_name = instruction_keys[0]
    instruction = specs['instructions'][self.instruction_name]

    ParameterValidator.assert_type_and_value(instruction, dict, GalaxyTrainMLModel.__name__, self.instruction_name)
    ParameterValidator.assert_keys_present(instruction.keys(), ['type'], GalaxyTrainMLModel.__name__, self.instruction_name)

    expected_type = TrainMLModelInstruction.__name__[:-11]
    assert instruction['type'] == expected_type, \
        f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
        f"for this tool."

    Util.check_paths(specs, GalaxyTrainMLModel.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Creates a RepertoireDataset from a metadata file and a list of repertoire files, and exports the dataset as a pickle file.

    Arguments:
        import_class: class used to import the individual repertoire files
        params: instance of DatasetImportParams with path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    metadata = pd.read_csv(params.metadata_file, sep=",")
    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')
    PathBuilder.build(params.result_path + "repertoires/")

    # import repertoires in parallel, one task per metadata row
    import_args = [(import_class, row, params) for _, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, import_args)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    # every metadata column except "filename" is a potential label
    label_candidates = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(params={label: list(set(metadata[label].values.tolist())) for label in label_candidates},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)
    PickleExporter.export(dataset, params.result_path)
    return dataset
def _prepare_report_config(self, instruction_key, instruction, split_key, symbol_table):
    """Resolve the per-type report ids under instruction[split_key]["reports"] into report objects."""
    if "reports" not in instruction[split_key]:
        return {}

    reports = instruction[split_key]["reports"]
    location = f"{instruction_key}/{split_key}/reports"

    # valid report categories are exactly the ReportConfig constructor parameters
    report_types = list(signature(ReportConfig).parameters.keys())
    ParameterValidator.assert_all_in_valid_list(reports.keys(), report_types, location, "reports")

    for report_type in reports:
        ParameterValidator.assert_type_and_value(reports[report_type], list, f"{location}/{report_type}", report_type)

    report_config_input = {}
    for report_type in reports:
        report_config_input[report_type] = {report_id: symbol_table.get(report_id) for report_id in reports[report_type]}

    return report_config_input
def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
    """Parse a single ML method specification, merge in defaults, and instantiate the method."""
    valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

    # a bare string is shorthand for a method with no custom parameters
    if type(ml_specification) is str:
        ml_specification = {ml_specification: {}}

    ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}
    spec_keys = list(ml_specification.keys())
    default_keys = ["model_selection_cv", "model_selection_n_folds"]

    ParameterValidator.assert_all_in_valid_list(list(spec_keys), default_keys + valid_class_values, "MLParser", ml_method_id)

    non_default_keys = [key for key in ml_specification.keys() if key not in default_keys]

    # exactly one key besides the two defaults: the ML method class name
    assert len(spec_keys) == 3, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected at least 1 key " \
                                f"(ML method name), got {len(spec_keys) - 2} instead: " \
                                f"{str([key for key in non_default_keys])[1:-1]}."

    ml_method_class_name = non_default_keys[0]
    ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

    ml_specification[ml_method_class_name] = {**DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
                                              **ml_specification[ml_method_class_name]}

    method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
    ml_specification[ml_method_class_name] = params
    method.name = ml_method_id

    return method, ml_specification
def import_dataset(params, name: str) -> SequenceDataset:
    """
    Returns a randomly generated sequence dataset according to the parameters;

    YAML specification:

        result_path: path/where/to/store/results/
        sequence_count: 100 # number of random sequences to generate
        length_probabilities:
            14: 0.8 # 80% of all generated sequences will have length 14
            15: 0.2 # 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0
    """
    valid_keys = [
        "sequence_count", "length_probabilities", "labels", "result_path"
    ]
    ParameterValidator.assert_all_in_valid_list(
        list(params.keys()), valid_keys, "RandomSequenceDatasetImport",
        "params")
    return RandomDatasetGenerator.generate_sequence_dataset(
        sequence_count=params["sequence_count"],
        length_probabilities=params["length_probabilities"],
        labels=params["labels"],
        path=params["result_path"])
def _prepare_parameters(reference: dict, max_edit_distances: dict, name: str = None):
    """Validate and normalize the parameters for MatchedReceptorsEncoder."""
    location = "MatchedReceptorsEncoder"
    legal_chains = [chain for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor())
                    for chain in receptor.get_chains()]

    if type(max_edit_distances) is int:
        # a single int applies to every legal chain
        max_edit_distances = {chain: max_edit_distances for chain in legal_chains}
    elif type(max_edit_distances) is dict:
        ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location, "max_edit_distances", exclusive=False)
    else:
        # neither int nor dict: let the validator raise a descriptive error
        ParameterValidator.assert_type_and_value(max_edit_distances, dict, location, 'max_edit_distances')

    reference_receptors = MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True)

    return {"reference_receptors": reference_receptors,
            "max_edit_distances": max_edit_distances,
            "name": name}
def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
    """Generate and return a random repertoire dataset described by params."""
    valid_keys = ["result_path", "repertoire_count", "sequence_count_probabilities", "sequence_length_probabilities", "labels"]
    ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys, "RandomRepertoireDatasetImport", "params")

    generator_args = {key: params[key] for key in
                      ["repertoire_count", "sequence_count_probabilities", "sequence_length_probabilities", "labels"]}
    return RandomDatasetGenerator.generate_repertoire_dataset(path=params["result_path"], **generator_args)
def _check_specs(self, workflow_specification):
    """Validate the top-level keys of the benchmark YAML specification, then its datasets and instructions."""
    tool = 'MultiDatasetBenchmarkTool'
    ParameterValidator.assert_keys(workflow_specification.keys(), ['definitions', 'instructions', 'output'],
                                   tool, 'YAML specification')
    self._check_dataset_specs(workflow_specification, tool)
    self._check_instruction_specs(workflow_specification, tool)
def _check_dataset(self, specs):
    """Ensure exactly one dataset and exactly one instruction are defined in the specification."""
    ParameterValidator.assert_keys_present(specs["definitions"].keys(), ['datasets'], DatasetGenerationTool.__name__, 'definitions')

    datasets = specs['definitions']['datasets']
    assert len(datasets.keys()) == 1, \
        f"{DatasetGenerationTool.__name__}: only one dataset can be defined with this Galaxy tool, got these " \
        f"instead: {list(datasets.keys())}."

    assert len(specs['instructions'].keys()) == 1, \
        f"{DatasetGenerationTool.__name__}: only one instruction of type DatasetExport can be defined with this Galaxy tool, got these " \
        f"instructions instead: {list(specs['instructions'].keys())}."
def _check_label_format(self, labels: list, instruction_key: str):
    """Check that every label is either a plain name or a single-key dict carrying only 'positive_class'."""
    ParameterValidator.assert_type_and_value(labels, list, TrainMLModelParser.__name__, f'{instruction_key}/labels')

    assert all(isinstance(label, (str, dict)) for label in labels), \
        f"{TrainMLModelParser.__name__}: labels under {instruction_key} were not defined properly. The list of labels has to either be a list of " \
        f"label names, or there can be a parameter 'positive_class' defined under the label name."

    # dict-style labels: one label name mapping to a dict with only the 'positive_class' parameter
    dict_labels = [label for label in labels if isinstance(label, dict)]
    assert all(len(list(label.keys())) == 1 and isinstance(list(label.values())[0], dict)
               and 'positive_class' in list(label.values())[0]
               and len(list(list(label.values())[0].keys())) == 1
               for label in dict_labels), \
        f"{TrainMLModelParser.__name__}: labels that are specified by more than label name, can include only one parameter called 'positive_class'."
def parse(specs: dict, symbol_table: SymbolTable) -> dict:
    """Validate (or default) the 'output' section of specs, register it, and return it."""
    if "output" not in specs:
        specs["output"] = {"format": "HTML"}
    else:
        ParameterValidator.assert_keys(specs["output"], ["format"], "OutputParser", "output")
        ParameterValidator.assert_in_valid_list(specs["output"]["format"], ["HTML"], "OutputParser", "format")

    symbol_table.add("output", SymbolType.OUTPUT, specs["output"])
    return specs["output"]
def _prepare_parameters(max_edit_distance: int, reference: dict, name: str = None):
    """Validate the parameters for MatchedSequencesEncoder and resolve the reference sequences."""
    location = "MatchedSequencesEncoder"
    ParameterValidator.assert_type_and_value(max_edit_distance, int, location, "max_edit_distance", min_inclusive=0)

    ref_sequences = MatchedReferenceUtil.prepare_reference(reference_params=reference, location=location, paired=False)

    return {"max_edit_distance": max_edit_distance,
            "reference_sequences": ref_sequences,
            "name": name}
def _check_instruction(self, specs):
    """Ensure the DatasetExport instruction lists exactly one dataset and one export format."""
    instruction_name = Util.check_instruction_type(specs, DatasetGenerationTool.__name__, DatasetExportInstruction.__name__[:-11])
    instruction = specs['instructions'][instruction_name]

    for key in ['datasets', 'export_formats']:
        ParameterValidator.assert_keys_present(list(instruction.keys()), [key], DatasetGenerationTool.__name__, instruction_name)
        ParameterValidator.assert_type_and_value(instruction[key], list, DatasetGenerationTool.__name__, f"{instruction_name}/{key}")
        assert len(instruction[key]) == 1, \
            f"{DatasetGenerationTool.__name__}: this tool accepts only one item under {key}, got {instruction[key]} " \
            f"instead."
def update_specs(self):
    """Read the YAML specification, run all validity checks, and rewrite result paths."""
    with open(self.yaml_path, 'r') as file:
        specs = yaml.safe_load(file)

    tool_name = DatasetGenerationTool.__name__
    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], tool_name, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], tool_name, "YAML specification")

    self._check_dataset(specs)
    self._check_instruction(specs)

    Util.check_paths(specs, tool_name)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def _prepare_reports(self, reports: list, symbol_table: SymbolTable) -> dict:
    """Resolve report ids to report objects from the symbol table; empty dict when none are given."""
    if reports is None:
        return {}

    resolved = {report_id: symbol_table.get(report_id) for report_id in reports}
    ParameterValidator.assert_all_type_and_value(resolved.values(), TrainMLModelReport, TrainMLModelParser.__name__, 'reports')
    return resolved
def check_instruction_type(specs: dict, tool_name, expected_instruction) -> str:
    """Check that exactly one instruction of the expected type is present and return its name."""
    ParameterValidator.assert_keys_present(list(specs.keys()), ['definitions', 'instructions'], tool_name, "YAML specification")

    instruction_keys = list(specs['instructions'].keys())
    assert len(instruction_keys) == 1, f"{tool_name}: multiple instructions were given " \
                                       f"({str(instruction_keys)[1:-1]}), but only one instruction of type " \
                                       f"{expected_instruction} should be specified."

    instruction_name = instruction_keys[0]
    instruction_type = specs['instructions'][instruction_name]['type']
    assert instruction_type == expected_instruction, \
        f"{tool_name}: instruction type has to be '{expected_instruction}', got {instruction_type} instead."

    return instruction_name
def __init__(self, percentage: float, show_warnings: bool = True):
    """TCRdist-based classifier; percentage must be a float in [0, 1]."""
    super().__init__()
    ParameterValidator.assert_type_and_value(percentage, float, "TCRdistClassifier", "percentage",
                                             min_inclusive=0., max_inclusive=1.)
    self.percentage = percentage
    # k and label start unset; presumably assigned during fitting — confirm against callers
    self.k = None
    self.label = None
    self.show_warnings = show_warnings
def _parse_settings(self, instruction: dict, symbol_table: SymbolTable) -> list:
    """Build one HPSetting per entry under instruction["settings"].

    For each setting: resolves the optional preprocessing sequence from the
    symbol table, builds the encoder for the instruction's dataset, and looks
    up the ML method plus its stored configuration.

    Raises:
        KeyError: when a required settings key is missing, or when the named
            preprocessing sequence was not defined in the specification.
    """
    try:
        settings = []
        for index, setting in enumerate(instruction["settings"]):
            # preprocessing is optional; when given, it must reference a sequence defined under definitions
            if "preprocessing" in setting:
                ParameterValidator.assert_type_and_value(
                    setting["preprocessing"], str, TrainMLModelParser.__name__,
                    f'settings: {index+1}. '
                    f'element: preprocessing')
                if symbol_table.contains(setting["preprocessing"]):
                    preprocessing_sequence = symbol_table.get(
                        setting["preprocessing"])
                    preproc_name = setting["preprocessing"]
                else:
                    raise KeyError(
                        f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                        f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                        f"definitions: {PreprocessingParser.keyword}.")
            else:
                # normalize the absent key so the assert_keys check below passes
                setting["preprocessing"] = None
                preprocessing_sequence = []
                preproc_name = None

            ParameterValidator.assert_keys(
                setting.keys(), ["preprocessing", "ml_method", "encoding"],
                TrainMLModelParser.__name__, f"settings, {index + 1}. entry")

            # build the encoder for this instruction's dataset and attach the dataset as context
            encoder = symbol_table.get(setting["encoding"]).build_object(symbol_table.get(instruction["dataset"]), **symbol_table.get_config(setting["encoding"])["encoder_params"])\
                .set_context({"dataset": symbol_table.get(instruction['dataset'])})

            s = HPSetting(encoder=encoder,
                          encoder_name=setting["encoding"],
                          encoder_params=symbol_table.get_config(
                              setting["encoding"])["encoder_params"],
                          ml_method=symbol_table.get(setting["ml_method"]),
                          ml_method_name=setting["ml_method"],
                          ml_params=symbol_table.get_config(
                              setting["ml_method"]),
                          preproc_sequence=preprocessing_sequence,
                          preproc_sequence_name=preproc_name)
            settings.append(s)
        return settings
    except KeyError as key_error:
        # re-raise any missing-key error with a message pointing at the settings section
        raise KeyError(
            f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction."
        )
def check_export_format(specs: dict, tool_name: str, instruction_name: str):
    """Validate that exactly one export format is specified for the instruction and return it."""
    instruction = specs['instructions'][instruction_name]

    ParameterValidator.assert_keys_present(list(instruction.keys()), ["export_formats"], tool_name,
                                           f"{instruction_name}/export_formats")
    ParameterValidator.assert_type_and_value(instruction["export_formats"], list, tool_name,
                                             f"{instruction_name}/export_formats")

    assert len(instruction["export_formats"]) == 1, \
        f"{tool_name}: only one format can be specified under export_formats parameter under " \
        f"{instruction_name}/export_formats, got {instruction['export_formats']} instead."

    return instruction["export_formats"][0]
def parse_signals(signals: dict, symbol_table: SymbolTable):
    """Build a Signal object per specification and register each in the symbol table."""
    for signal_id, signal_spec in signals.items():
        ParameterValidator.assert_keys_present(signal_spec.keys(), SignalParser.VALID_KEYS, "SignalParser", signal_id)

        implanting_strategy = SignalParser._get_implanting_strategy(signal_id, signal_spec)

        # every motif referenced by the signal must already be registered
        ParameterValidator.assert_keys(signal_spec["motifs"], symbol_table.get_keys_by_type(SymbolType.MOTIF),
                                       "SignalParser", f"motifs in signal {signal_id}", False)
        motifs = [symbol_table.get(motif_id) for motif_id in signal_spec["motifs"]]

        symbol_table.add(signal_id, SymbolType.SIGNAL, Signal(signal_id, motifs, implanting_strategy))

    return symbol_table, signals
def build_object(cls, **kwargs):
    """Normalize and validate the attribute lists, then construct a CytoscapeNetworkExporter.

    Both additional_node_attributes and additional_edge_attributes default to an
    empty list when omitted or explicitly set to None.
    """
    for key in ("additional_node_attributes", "additional_edge_attributes"):
        # fix: original indexed kwargs[key] directly and raised KeyError when the key was absent
        if kwargs.get(key) is None:
            kwargs[key] = []
        ParameterValidator.assert_type_and_value(kwargs[key], list, "CytoscapeNetworkExporter", key)

    return CytoscapeNetworkExporter(**kwargs)
def __init__(self, k: int, skip_first_n_aa: int, skip_last_n_aa: int, abundance: str, normalize_all_features: bool, name: str = None):
    """Configure the AtchleyKmer encoder after validating every parameter."""
    location = "AtchleyKmerEncoder"
    ParameterValidator.assert_type_and_value(k, int, location, "k", min_inclusive=1)
    ParameterValidator.assert_type_and_value(skip_first_n_aa, int, location, "skip_first_n_aa", min_inclusive=0)
    ParameterValidator.assert_type_and_value(skip_last_n_aa, int, location, "skip_last_n_aa", min_inclusive=0)
    ParameterValidator.assert_in_valid_list(abundance.upper(), [ab.name for ab in RelativeAbundanceType], location, "abundance")
    ParameterValidator.assert_type_and_value(normalize_all_features, bool, location, "normalize_all_features")

    self.k = k
    self.skip_first_n_aa = skip_first_n_aa
    self.skip_last_n_aa = skip_last_n_aa
    self.abundance = RelativeAbundanceType[abundance.upper()]
    self.normalize_all_features = normalize_all_features
    self.name = name
    # artifact paths start unset; presumably assigned later during encoding — confirm against usage
    self.scaler_path = None
    self.vectorizer_path = None
def _prepare_parameters(distance_metric: str, attributes_to_match: list, sequence_batch_size: int, context: dict = None):
    """Validate the distance metric name and assemble the DistanceEncoder parameter dict.

    Arguments:
        distance_metric: name of a DistanceMetricType member (case-insensitive)
        attributes_to_match: sequence attributes compared when computing distances
        sequence_batch_size: number of sequences processed per batch
        context: optional context dict passed through unchanged
    """
    valid_metrics = [metric.name for metric in DistanceMetricType]
    # fix: validate the upper-cased name so lower-case metric names are accepted,
    # consistent with the DistanceMetricType[distance_metric.upper()] lookup below
    ParameterValidator.assert_in_valid_list(distance_metric.upper(), valid_metrics, "DistanceEncoder", "distance_metric")

    return {
        "distance_metric": DistanceMetricType[distance_metric.upper()],
        "attributes_to_match": attributes_to_match,
        "sequence_batch_size": sequence_batch_size,
        "context": context
    }
def build_object(cls, **kwargs):
    """Validate kwargs and create a DeepRCMotifDiscovery report."""
    location = "DeepRCMotifDiscovery"
    n_steps, threshold = kwargs["n_steps"], kwargs["threshold"]

    ParameterValidator.assert_type_and_value(n_steps, int, location, "n_steps", min_inclusive=1)
    ParameterValidator.assert_type_and_value(threshold, float, location, "threshold", min_inclusive=0, max_inclusive=1)

    return DeepRCMotifDiscovery(n_steps=n_steps, threshold=threshold, name=kwargs.get("name"))
def parse_motifs(motifs: dict, symbol_table: SymbolTable):
    """Parse each motif specification and register the resulting Motif objects in the symbol table."""
    valid_motif_keys = ["seed", "instantiation", "seed_chain1", "seed_chain2", "name_chain1", "name_chain2"]

    for motif_id, motif_spec in motifs.items():
        ParameterValidator.assert_keys(motif_spec.keys(), valid_motif_keys, "MotifParser", motif_id, exclusive=False)
        symbol_table.add(motif_id, SymbolType.MOTIF, MotifParser._parse_motif(motif_id, motif_spec))

    return symbol_table, motifs
def parse_encoder(key: str, specs: dict):
    """Resolve the encoder class named in specs and validate its parameter names."""
    class_path = "encodings"
    valid_encoders = ReflectionHandler.all_nonabstract_subclass_basic_names(DatasetEncoder, "Encoder", class_path)

    encoder = ObjectParser.get_class(specs, valid_encoders, "Encoder", class_path, "EncodingParser", key)
    params = ObjectParser.get_all_params(specs, class_path, encoder.__name__[:-7], key)

    # every given parameter must match a constructor parameter (excluding self)
    init_params = inspect.signature(encoder.__init__).parameters.keys()
    required_params = [p for p in list(init_params) if p != "self"]
    ParameterValidator.assert_all_in_valid_list(params.keys(), required_params, "EncoderParser",
                                                f"{key}/{encoder.__name__.replace('Encoder', '')}")

    return encoder, params
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> ExploratoryAnalysisInstruction:
    """Construct an ExploratoryAnalysisInstruction with one analysis unit per entry under 'analyses'."""
    ParameterValidator.assert_keys(instruction, ["analyses", "type"], "ExploratoryAnalysisParser", "ExploratoryAnalysis")

    units = {}
    for analysis_key, analysis in instruction["analyses"].items():
        units[analysis_key] = ExploratoryAnalysisUnit(**self._prepare_params(analysis, symbol_table))

    return ExploratoryAnalysisInstruction(exploratory_analysis_units=units, name=key)
def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    """Dispatch the instruction to its type-specific parser and register the parsed object."""
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)

    # valid types are derived from the discovered *Parser classes, minus the "Parser" suffix
    valid_instructions = [cls[:-6] for cls in
                          ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
    ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

    parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
    instruction_object = parser.parse(key, instruction, symbol_table, path)
    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)

    return instruction, symbol_table