def prepare_specs(self):
    """Load the YAML specification, validate its structure, and store instruction/dataset metadata on self."""
    with self.yaml_path.open("r") as yaml_file:
        specs = yaml.safe_load(yaml_file)

    self.instruction_name = Util.check_instruction_type(specs, DataSimulationTool.__name__, self.expected_instruction)
    self.export_format = Util.check_export_format(specs, DataSimulationTool.__name__, self.instruction_name)

    ParameterValidator.assert_keys_present(specs["definitions"], ["datasets"], DataSimulationTool.__name__, "definitions/datasets")
    ParameterValidator.assert_type_and_value(specs['definitions']['datasets'], dict, DataSimulationTool.__name__, "definitions/datasets")

    dataset_names = list(specs['definitions']['datasets'].keys())
    assert len(dataset_names) == 1, f"{DataSimulationTool.__name__}: one dataset has to be defined under definitions/datasets, got " \
                                    f"{dataset_names} instead."
    self.dataset_name = dataset_names[0]

    Util.check_paths(specs, DataSimulationTool.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def _parse_settings(self, instruction: dict, symbol_table: SymbolTable) -> list:
    """Build one HPSetting per entry under `settings`, resolving encoder, ML method and optional preprocessing.

    Raises ValueError for preprocessing that changes the example count (not allowed here) and
    KeyError for undefined preprocessing ids or missing setting keys.
    """
    try:
        settings = []
        for index, setting in enumerate(instruction["settings"]):
            if "preprocessing" in setting and setting["preprocessing"] is not None:
                ParameterValidator.assert_type_and_value(setting["preprocessing"], str, TrainMLModelParser.__name__,
                                                         f'settings: {index + 1}. element: preprocessing')
                if not symbol_table.contains(setting["preprocessing"]):
                    raise KeyError(f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                                   f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                                   f"definitions: {PreprocessingParser.keyword}.")

                preproc_name = setting["preprocessing"]
                preprocessing_sequence = symbol_table.get(preproc_name)

                # this instruction cannot handle preprocessing that changes the number of examples
                if not all(preproc.keeps_example_count() for preproc in preprocessing_sequence):
                    raise ValueError(f"{TrainMLModelParser.__name__}: preprocessing sequence {preproc_name} includes preprocessing that "
                                     f"change the number of examples at runtime and as such cannot be used with this instruction. See the "
                                     f"documentation for the preprocessing or alternatively use them with other instructions.")
            else:
                setting["preprocessing"] = None
                preprocessing_sequence = []
                preproc_name = None

            ParameterValidator.assert_keys(setting.keys(), ["preprocessing", "ml_method", "encoding"],
                                           TrainMLModelParser.__name__, f"settings, {index + 1}. entry")

            encoder = symbol_table.get(setting["encoding"]) \
                .build_object(symbol_table.get(instruction["dataset"]),
                              **symbol_table.get_config(setting["encoding"])["encoder_params"]) \
                .set_context({"dataset": symbol_table.get(instruction['dataset'])})

            ml_method = symbol_table.get(setting["ml_method"])
            ml_method.check_encoder_compatibility(encoder)

            settings.append(HPSetting(encoder=encoder, encoder_name=setting["encoding"],
                                      encoder_params=symbol_table.get_config(setting["encoding"])["encoder_params"],
                                      ml_method=ml_method, ml_method_name=setting["ml_method"],
                                      ml_params=symbol_table.get_config(setting["ml_method"]),
                                      preproc_sequence=preprocessing_sequence, preproc_sequence_name=preproc_name))
        return settings
    except KeyError as key_error:
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction.")
def _prepare_parameters(reference: dict, max_edit_distances: dict, name: str = None):
    """Validate MatchedReceptorsEncoder parameters; an int max_edit_distances is expanded to all legal chains."""
    location = "MatchedReceptorsEncoder"

    legal_chains = []
    for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor()):
        legal_chains.extend(receptor.get_chains())

    # NOTE: exact type checks (not isinstance) are kept on purpose so that e.g. bool is rejected
    if type(max_edit_distances) is int:
        max_edit_distances = {chain: max_edit_distances for chain in legal_chains}
    elif type(max_edit_distances) is dict:
        ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location, "max_edit_distances", exclusive=False)
    else:
        # any other type: raise the standard type error
        ParameterValidator.assert_type_and_value(max_edit_distances, dict, location, 'max_edit_distances')

    reference_receptors = MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True)

    return {"reference_receptors": reference_receptors, "max_edit_distances": max_edit_distances, "name": name}
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> ExploratoryAnalysisInstruction:
    """Validate an ExploratoryAnalysis instruction and build one analysis unit per entry under `analyses`."""
    ParameterValidator.assert_keys(instruction, ["analyses", "type", "number_of_processes"], "ExploratoryAnalysisParser", "ExploratoryAnalysis")
    ParameterValidator.assert_type_and_value(instruction["number_of_processes"], int, ExploratoryAnalysisParser.__name__, "number_of_processes")

    exp_analysis_units = {}
    for analysis_key, analysis in instruction["analyses"].items():
        unit_params = self._prepare_params(analysis, symbol_table, f"{key}/{analysis_key}")
        unit_params["number_of_processes"] = instruction["number_of_processes"]
        exp_analysis_units[analysis_key] = ExploratoryAnalysisUnit(**unit_params)

    return ExploratoryAnalysisInstruction(exploratory_analysis_units=exp_analysis_units, name=key)
def build_object(cls, **kwargs):
    """Validate kwargs for a FeatureComparison report and construct it.

    Fix: `log_scale` previously defaulted to None when omitted, which always failed
    the subsequent bool type check; it now defaults to False so the parameter is
    genuinely optional. All other validation is unchanged.
    """
    comparison_label = kwargs.get("comparison_label")
    color_grouping_label = kwargs.get("color_grouping_label")
    row_grouping_label = kwargs.get("row_grouping_label")
    column_grouping_label = kwargs.get("column_grouping_label")
    log_scale = kwargs.get("log_scale", False)  # was None, which could never pass the bool check below
    keep_fraction = float(kwargs["keep_fraction"]) if "keep_fraction" in kwargs else 1.0

    ParameterValidator.assert_type_and_value(keep_fraction, float, "FeatureComparison", "keep_fraction", min_inclusive=0, max_inclusive=1)
    ParameterValidator.assert_type_and_value(log_scale, bool, "FeatureComparison", "log_scale")

    assert comparison_label is not None, "FeatureComparison: the parameter 'comparison_label' must be set in order to be able to compare across this label"
    assert comparison_label != color_grouping_label, f"FeatureComparison: comparison label {comparison_label} can not be used as color_grouping_label"
    assert comparison_label != row_grouping_label, f"FeatureComparison: comparison label {comparison_label} can not be used as row_grouping_label"
    assert comparison_label != column_grouping_label, f"FeatureComparison: comparison label {comparison_label} can not be used as column_grouping_label"

    return FeatureComparison(**kwargs)
def _prepare_specs(self):
    """Load and validate the YAML specification for the Galaxy TrainMLModel tool; stores the instruction name."""
    with self.yaml_path.open("r") as yaml_file:
        specs = yaml.safe_load(yaml_file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], GalaxyTrainMLModel.__name__, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], GalaxyTrainMLModel.__name__, "YAML specification")
    ParameterValidator.assert_type_and_value(specs["instructions"], dict, GalaxyTrainMLModel.__name__, "instructions")

    instruction_names = list(specs["instructions"].keys())
    assert len(instruction_names) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                        f"`instructions`, got the following instead: {instruction_names}."
    self.instruction_name = instruction_names[0]
    instruction = specs['instructions'][self.instruction_name]

    ParameterValidator.assert_type_and_value(instruction, dict, GalaxyTrainMLModel.__name__, self.instruction_name)
    ParameterValidator.assert_keys_present(instruction.keys(), ['type'], GalaxyTrainMLModel.__name__, self.instruction_name)

    expected_type = TrainMLModelInstruction.__name__[:-11]  # class name minus the "Instruction" suffix
    assert instruction['type'] == expected_type, \
        f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
        f"for this tool."
    assert len(instruction['labels']) == 1, f"{GalaxyTrainMLModel.__name__}: one label has to be specified under " \
                                            f"`labels`, got the following instead: {instruction['labels']}."

    Util.check_paths(specs, GalaxyTrainMLModel.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def _prepare_parameters(motif_filepath: str, match_v_genes: bool, sum_counts: bool, name: str = None):
    """Validate MatchedRegexEncoder parameters and derive the receptor chains listed in the motif file."""
    ParameterValidator.assert_type_and_value(match_v_genes, bool, "MatchedRegexEncoder", "match_v_genes")
    ParameterValidator.assert_type_and_value(sum_counts, bool, "MatchedRegexEncoder", "sum_counts")

    motif_filepath = Path(motif_filepath)
    assert motif_filepath.is_file(), f"MatchedRegexEncoder: the file {motif_filepath} does not exist. " \
                                     f"Specify the correct path under motif_filepath."

    # read only the header row (nrows=0) to validate the column names without loading data
    file_columns = list(pd.read_csv(motif_filepath, sep="\t", iterator=False, dtype=str, nrows=0).columns)
    valid_columns = ["id"] + [f"{c.value}V" for c in Chain] + [f"{c.value}_regex" for c in Chain]
    ParameterValidator.assert_all_in_valid_list(file_columns, valid_columns, "MatchedRegexEncoder", "motif_filepath (column names)")

    chains = [column.split("_")[0] for column in file_columns if column.endswith("_regex")]

    if match_v_genes:
        # every chain with a regex column must also provide a V-gene column
        for chain in chains:
            assert f"{chain}V" in file_columns, f"MatchedRegexEncoder: expected column {chain}V to be present in the columns of motif_filepath. " \
                                                f"Remove {chain}_regex from columns, or set match_v_genes to False."

    return {"motif_filepath": motif_filepath, "match_v_genes": match_v_genes, "sum_counts": sum_counts,
            "chains": chains, "name": name}
def _prepare_report_config(self, instruction_key, instruction, split_key, symbol_table):
    """Resolve reports under `split_key` into {report_type: {report_id: report object}}; {} when none are given."""
    split_config = instruction[split_key]
    if "reports" in split_config and len(split_config["reports"]) > 0:
        location = f"{instruction_key}/{split_key}/reports"
        # valid report types are exactly the parameter names of ReportConfig
        report_types = list(signature(ReportConfig).parameters.keys())
        ParameterValidator.assert_all_in_valid_list(split_config["reports"].keys(), report_types, location, "reports")

        for report_type in split_config["reports"]:
            ParameterValidator.assert_type_and_value(split_config["reports"][report_type], list,
                                                     f"{location}/{report_type}", report_type)

        report_config_input = {}
        for report_type in split_config["reports"]:
            report_config_input[report_type] = {report_id: symbol_table.get(report_id)
                                                for report_id in split_config["reports"][report_type]}
    else:
        report_config_input = {}

    return report_config_input
def _prepare_reports(self, reports: list, symbol_table: SymbolTable) -> dict:
    """Resolve report ids to TrainMLModelReport objects; returns {} when no reports were specified."""
    if reports is None:
        return {}

    ParameterValidator.assert_type_and_value(reports, list, TrainMLModelParser.__name__, "reports")

    report_objects = {}
    for report_id in reports:
        report_objects[report_id] = symbol_table.get(report_id)

    ParameterValidator.assert_all_type_and_value(report_objects.values(), TrainMLModelReport, TrainMLModelParser.__name__, 'reports')
    return report_objects
def _check_label_format(self, labels: list, instruction_key: str):
    """Check that `labels` is a list of label names (str) or single-key dicts {name: {'positive_class': ...}}."""
    ParameterValidator.assert_type_and_value(labels, list, TrainMLModelParser.__name__, f'{instruction_key}/labels')

    assert all(isinstance(label, (str, dict)) for label in labels), \
        f"{TrainMLModelParser.__name__}: labels under {instruction_key} were not defined properly. The list of labels has to either be a list of " \
        f"label names, or there can be a parameter 'positive_class' defined under the label name."

    dict_labels = [label for label in labels if isinstance(label, dict)]
    assert all(len(list(label.keys())) == 1 and isinstance(list(label.values())[0], dict)
               and 'positive_class' in list(label.values())[0] and len(list(list(label.values())[0].keys())) == 1
               for label in dict_labels), \
        f"{TrainMLModelParser.__name__}: labels that are specified by more than label name, can include only one parameter called 'positive_class'."
def __init__(self, percentage: float, show_warnings: bool = True):
    """Store TCRdistClassifier parameters; `percentage` must be a float in [0, 1]."""
    super().__init__()
    ParameterValidator.assert_type_and_value(percentage, float, "TCRdistClassifier", "percentage",
                                             min_inclusive=0., max_inclusive=1.)
    self.percentage = percentage
    self.show_warnings = show_warnings
    # set later (e.g. during fitting) — not known at construction time
    self.k = None
    self.label = None
def build_object(cls, **kwargs):
    """Validate parameters and construct a DuplicateSequenceFilter."""
    location = cls.__name__
    ParameterValidator.assert_keys(kwargs.keys(), ["filter_sequence_type", "batch_size", "count_agg"], location, "DuplicateSequenceFilter")

    seq_type_name = kwargs["filter_sequence_type"].upper()
    count_agg_name = kwargs["count_agg"].upper()
    ParameterValidator.assert_in_valid_list(seq_type_name, [item.name for item in SequenceType], location, "filter_sequence_type")
    ParameterValidator.assert_in_valid_list(count_agg_name, [item.name for item in CountAggregationFunction], location, "count_agg")
    ParameterValidator.assert_type_and_value(kwargs["batch_size"], int, location, "batch_size", 1)

    return DuplicateSequenceFilter(filter_sequence_type=SequenceType[seq_type_name],
                                   batch_size=kwargs["batch_size"],
                                   count_agg=CountAggregationFunction[count_agg_name])
def build_object(cls, **kwargs):
    """Construct a CytoscapeNetworkExporter, normalizing None attribute lists to empty lists first."""
    for attr_key in ("additional_node_attributes", "additional_edge_attributes"):
        if kwargs[attr_key] is None:
            kwargs[attr_key] = []
        ParameterValidator.assert_type_and_value(kwargs[attr_key], list, "CytoscapeNetworkExporter", attr_key)
    return CytoscapeNetworkExporter(**kwargs)
def _prepare_parameters(normalization_type: str, reads: str, sequence_encoding: str, k: int = 0, k_left: int = 0,
                        k_right: int = 0, min_gap: int = 0, max_gap: int = 0, metadata_fields_to_include: list = None,
                        name: str = None, scale_to_unit_variance: bool = False, scale_to_zero_mean: bool = False,
                        sequence_type: str = None):
    """Validate KmerFrequencyEncoder parameters and convert the string options to their enum values."""
    location = KmerFrequencyEncoder.__name__

    ParameterValidator.assert_in_valid_list(normalization_type.upper(), [item.name for item in NormalizationType], location, "normalization_type")
    ParameterValidator.assert_in_valid_list(reads.upper(), [item.name for item in ReadsType], location, "reads")
    ParameterValidator.assert_in_valid_list(sequence_encoding.upper(), [item.name for item in SequenceEncodingType], location, "sequence_encoding")
    ParameterValidator.assert_type_and_value(scale_to_zero_mean, bool, location, "scale_to_zero_mean")
    ParameterValidator.assert_type_and_value(scale_to_unit_variance, bool, location, "scale_to_unit_variance")
    ParameterValidator.assert_type_and_value(sequence_type, str, location, 'sequence_type')
    ParameterValidator.assert_in_valid_list(sequence_type.upper(), [st.name for st in SequenceType], location, 'sequence_type')

    # IMGT positional encodings are only defined for amino acid sequences
    if "IMGT" in sequence_encoding.upper():
        assert sequence_type.upper() == SequenceType.AMINO_ACID.name, f"{location}: for IMGT-based k-mer frequency encoding (here: " \
                                                                      f"{sequence_encoding.upper()}), sequence type has to be 'amino_acid'."

    vars_to_check = {"k": k, "k_left": k_left, "k_right": k_right, "min_gap": min_gap, "max_gap": max_gap}
    for param_name, param_value in vars_to_check.items():
        ParameterValidator.assert_type_and_value(param_value, int, location, param_name, min_inclusive=0)

    # gapped encodings require both flank lengths to be set
    if "gap" in sequence_encoding.lower():
        assert k_left != 0 and k_right != 0, f"KmerFrequencyEncoder: sequence encoding {sequence_encoding} was chosen, but k_left " \
                                             f"({k_left}) or k_right ({k_right}) have to be set and larger than 0."

    return {"normalization_type": NormalizationType[normalization_type.upper()],
            "reads": ReadsType[reads.upper()],
            "sequence_encoding": SequenceEncodingType[sequence_encoding.upper()],
            "name": name,
            "scale_to_zero_mean": scale_to_zero_mean,
            "scale_to_unit_variance": scale_to_unit_variance,
            'sequence_type': SequenceType[sequence_type.upper()],
            **vars_to_check}
def __init__(self, k: int, skip_first_n_aa: int, skip_last_n_aa: int, abundance: str, normalize_all_features: bool, name: str = None):
    """Validate and store AtchleyKmerEncoder parameters; `abundance` is converted to RelativeAbundanceType."""
    location = "AtchleyKmerEncoder"
    ParameterValidator.assert_type_and_value(k, int, location, "k", 1)
    ParameterValidator.assert_type_and_value(skip_first_n_aa, int, location, "skip_first_n_aa", 0)
    ParameterValidator.assert_type_and_value(skip_last_n_aa, int, location, "skip_last_n_aa", 0)
    ParameterValidator.assert_in_valid_list(abundance.upper(), [ab.name for ab in RelativeAbundanceType], location, "abundance")
    ParameterValidator.assert_type_and_value(normalize_all_features, bool, location, "normalize_all_features")

    self.k = k
    self.skip_first_n_aa = skip_first_n_aa
    self.skip_last_n_aa = skip_last_n_aa
    self.abundance = RelativeAbundanceType[abundance.upper()]
    self.normalize_all_features = normalize_all_features
    self.name = name
    # placeholders — presumably filled in once the encoder is fitted; not set here
    self.scaler_path = None
    self.vectorizer_path = None
def _prepare_parameters(use_positional_info: bool, distance_to_seq_middle: int, flatten: bool, sequence_type: str, name: str = None):
    """Validate OneHotEncoder parameters; distance_to_seq_middle is ignored (set to None) without positional info."""
    location = OneHotEncoder.__name__

    ParameterValidator.assert_type_and_value(use_positional_info, bool, location, "use_positional_info")
    if use_positional_info:
        ParameterValidator.assert_type_and_value(distance_to_seq_middle, int, location, "distance_to_seq_middle", min_inclusive=1)
    else:
        distance_to_seq_middle = None

    ParameterValidator.assert_type_and_value(flatten, bool, location, "flatten")
    ParameterValidator.assert_type_and_value(sequence_type, str, location, 'sequence_type')
    ParameterValidator.assert_in_valid_list(sequence_type.upper(), [item.name for item in SequenceType], location, 'sequence_type')

    return {"use_positional_info": use_positional_info,
            "distance_to_seq_middle": distance_to_seq_middle,
            "flatten": flatten,
            "sequence_type": SequenceType[sequence_type.upper()],
            "name": name}
def _parse_settings(self, instruction: dict, symbol_table: SymbolTable) -> list:
    """Build one HPSetting per entry under `settings`.

    Fix: the preprocessing branch previously triggered on the key's mere presence, so an
    explicit `preprocessing: null` in the YAML failed the str type check even though the
    code below handles the no-preprocessing case; an `is not None` guard is added,
    consistent with the other `_parse_settings` variant in this file.

    Raises KeyError for undefined preprocessing ids or missing setting keys.
    """
    try:
        settings = []
        for index, setting in enumerate(instruction["settings"]):
            if "preprocessing" in setting and setting["preprocessing"] is not None:
                ParameterValidator.assert_type_and_value(setting["preprocessing"], str, TrainMLModelParser.__name__,
                                                         f'settings: {index + 1}. element: preprocessing')
                if symbol_table.contains(setting["preprocessing"]):
                    preprocessing_sequence = symbol_table.get(setting["preprocessing"])
                    preproc_name = setting["preprocessing"]
                else:
                    raise KeyError(f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                                   f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                                   f"definitions: {PreprocessingParser.keyword}.")
            else:
                setting["preprocessing"] = None
                preprocessing_sequence = []
                preproc_name = None

            ParameterValidator.assert_keys(setting.keys(), ["preprocessing", "ml_method", "encoding"],
                                           TrainMLModelParser.__name__, f"settings, {index + 1}. entry")

            encoder = symbol_table.get(setting["encoding"]) \
                .build_object(symbol_table.get(instruction["dataset"]),
                              **symbol_table.get_config(setting["encoding"])["encoder_params"]) \
                .set_context({"dataset": symbol_table.get(instruction['dataset'])})

            s = HPSetting(encoder=encoder, encoder_name=setting["encoding"],
                          encoder_params=symbol_table.get_config(setting["encoding"])["encoder_params"],
                          ml_method=symbol_table.get(setting["ml_method"]), ml_method_name=setting["ml_method"],
                          ml_params=symbol_table.get_config(setting["ml_method"]),
                          preproc_sequence=preprocessing_sequence, preproc_sequence_name=preproc_name)
            settings.append(s)
        return settings
    except KeyError as key_error:
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction.")
def check_export_format(specs: dict, tool_name: str, instruction_name: str):
    """Validate that exactly one export format is configured for the instruction and return it."""
    instruction = specs['instructions'][instruction_name]

    ParameterValidator.assert_keys_present(list(instruction.keys()), ["export_formats"], tool_name, f"{instruction_name}/export_formats")
    ParameterValidator.assert_type_and_value(instruction["export_formats"], list, tool_name, f"{instruction_name}/export_formats")

    export_formats = instruction["export_formats"]
    assert len(export_formats) == 1, \
        f"{tool_name}: only one format can be specified under export_formats parameter under " \
        f"{instruction_name}/export_formats, got {export_formats} instead."

    return export_formats[0]
def _check_instruction(self, specs):
    """Check that the DatasetExport instruction lists exactly one dataset and one export format."""
    instruction_name = Util.check_instruction_type(specs, DatasetGenerationTool.__name__,
                                                   DatasetExportInstruction.__name__[:-11])
    instruction = specs['instructions'][instruction_name]

    for key in ['datasets', 'export_formats']:
        ParameterValidator.assert_keys_present(list(instruction.keys()), [key], DatasetGenerationTool.__name__, instruction_name)
        ParameterValidator.assert_type_and_value(instruction[key], list, DatasetGenerationTool.__name__, f"{instruction_name}/{key}")
        assert len(instruction[key]) == 1, \
            f"{DatasetGenerationTool.__name__}: this tool accepts only one item under {key}, got {instruction[key]} " \
            f"instead."
def _check_dataset_specs(self, workflow_specification, location):
    """Validate definitions/datasets and require more than one dataset for multi-dataset benchmarking.

    Fix: the assertion message indexed `dataset_names[0]`, which itself raised an
    IndexError when zero datasets were defined, masking the real problem; the
    message is now built safely for the empty case.
    """
    ParameterValidator.assert_type_and_value(workflow_specification['definitions'], dict, location, 'definitions')
    ParameterValidator.assert_keys_present(workflow_specification['definitions'].keys(), ['datasets'], location, 'definitions')
    ParameterValidator.assert_type_and_value(workflow_specification['definitions']['datasets'], dict, location, 'datasets')

    dataset_names = list(workflow_specification['definitions']['datasets'].keys())
    assert len(dataset_names) > 1, \
        f"MultiDatasetBenchmarkTool: there is only one dataset specified ({dataset_names[0] if dataset_names else 'none'}), while this tool operates on multiple datasets. " \
        f"If only one dataset is needed, consider using the training instruction directly."
def check_label_format(labels: list, instruction_name: str, yaml_location: str):
    """Check that `labels` is a list of label names (str) or single-key dicts {name: {'positive_class': ...}}."""
    ParameterValidator.assert_type_and_value(labels, list, instruction_name, f'{yaml_location}/labels')

    assert all(isinstance(label, (str, dict)) for label in labels), \
        f"{instruction_name}: labels under {yaml_location} were not defined properly. The list of labels has to either be a list of " \
        f"label names, or there can be a parameter 'positive_class' defined under the label name, for example:\n" \
        f"labels: # one label with no positive class (T1D) and one with a positive class (CMV)\n" \
        f"- T1D\n" \
        f"- CMV: # when defining a positive class, make sure to use the correct indentation\n" \
        f" positive_class: True\n"

    dict_labels = [label for label in labels if isinstance(label, dict)]
    assert all(len(list(label.keys())) == 1 and isinstance(list(label.values())[0], dict)
               and 'positive_class' in list(label.values())[0] and len(list(list(label.values())[0].keys())) == 1
               for label in dict_labels), \
        f"{instruction_name}: The only legal parameter under a label name is 'positive_class'. If 'positive_class' is not specified, please remove the colon after the label name. "
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> SubsamplingInstruction:
    """Validate a Subsampling instruction and construct it with the resolved dataset and exporter classes."""
    valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
    ParameterValidator.assert_keys(instruction.keys(), valid_keys, SubsamplingParser.__name__, key)

    dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, SubsamplingParser.__name__, f'{key}/dataset')
    dataset = symbol_table.get(instruction['dataset'])

    # subsampled sizes must be ints between 1 and the dataset's example count
    ParameterValidator.assert_type_and_value(instruction['subsampled_dataset_sizes'], list, SubsamplingParser.__name__,
                                             f'{key}/subsampled_dataset_sizes')
    ParameterValidator.assert_all_type_and_value(instruction['subsampled_dataset_sizes'], int, SubsamplingParser.__name__,
                                                 f'{key}/subsampled_dataset_sizes', 1, dataset.get_example_count())

    valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
    ParameterValidator.assert_type_and_value(instruction['dataset_export_formats'], list, SubsamplingParser.__name__,
                                             f"{key}/dataset_export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction['dataset_export_formats'], valid_export_formats,
                                                SubsamplingParser.__name__, f"{key}/dataset_export_formats")

    exporter_classes = [ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                        for export_format in instruction['dataset_export_formats']]

    return SubsamplingInstruction(dataset=dataset,
                                  subsampled_dataset_sizes=instruction['subsampled_dataset_sizes'],
                                  dataset_export_formats=exporter_classes,
                                  name=key)
def build_object(cls, **kwargs):
    """Validate n_steps and threshold, then construct a DeepRCMotifDiscovery report."""
    location = "DeepRCMotifDiscovery"
    report_name = kwargs.get("name")
    ParameterValidator.assert_type_and_value(kwargs["n_steps"], int, location, "n_steps", min_inclusive=1)
    ParameterValidator.assert_type_and_value(kwargs["threshold"], float, location, "threshold", min_inclusive=0, max_inclusive=1)
    return DeepRCMotifDiscovery(n_steps=kwargs["n_steps"], threshold=kwargs["threshold"], name=report_name)
def build_object(cls, **kwargs):
    """Validate metadata_labels and name, then construct a ConfounderAnalysis report."""
    location = ConfounderAnalysis.__name__
    ParameterValidator.assert_keys(kwargs.keys(), ['metadata_labels', 'name'], location, location)
    ParameterValidator.assert_type_and_value(kwargs['metadata_labels'], list, location, 'metadata_labels')
    ParameterValidator.assert_all_type_and_value(kwargs['metadata_labels'], str, location, 'metadata_labels')
    ParameterValidator.assert_type_and_value(kwargs['name'], str, location, 'name')
    return ConfounderAnalysis(metadata_labels=kwargs['metadata_labels'], name=kwargs['name'])
def build_object(cls, **kwargs):
    """Validate coefficient-plot settings and construct a Coefficients report."""
    location = "Coefficients"
    coefs_to_plot = [coef.upper() for coef in kwargs["coefs_to_plot"]]
    name = kwargs.get("name")

    ParameterValidator.assert_all_in_valid_list(coefs_to_plot, [item.name.upper() for item in CoefficientPlottingSetting],
                                                location, "coefs_to_plot")

    # cutoff / n_largest are only required when the corresponding plot setting is requested
    cutoff = []
    if CoefficientPlottingSetting.CUTOFF.name in coefs_to_plot:
        cutoff = kwargs["cutoff"]
        ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
        ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

    n_largest = []
    if CoefficientPlottingSetting.N_LARGEST.name in coefs_to_plot:
        n_largest = kwargs["n_largest"]
        ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
        ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

    coefs = CoefficientPlottingSettingList()
    for keyword in coefs_to_plot:
        coefs.append(CoefficientPlottingSetting[keyword.upper()])

    return Coefficients(coefs_to_plot=coefs, cutoff=cutoff, n_largest=n_largest, name=name)
def _prepare_parameters(max_edit_distance: int, reference: dict, name: str = None):
    """Validate max_edit_distance and prepare the (unpaired) reference sequences for MatchedSequencesEncoder."""
    location = "MatchedSequencesEncoder"
    ParameterValidator.assert_type_and_value(max_edit_distance, int, location, "max_edit_distance", min_inclusive=0)
    reference_sequences = MatchedReferenceUtil.prepare_reference(reference_params=reference, location=location, paired=False)
    return {"max_edit_distance": max_edit_distance, "reference_sequences": reference_sequences, "name": name}
def create_method_instance(ml_specification: dict, ml_method_class, key: str) -> tuple:
    """Instantiate an ML method from its spec; list-valued params become a parameter grid (requires model selection CV)."""
    ml_params = {}
    spec_params = ml_specification[ml_method_class.__name__]

    if spec_params is None or len(spec_params.keys()) == 0:
        ml_method = ml_method_class()
    else:
        ml_params = spec_params
        init_method_keys = inspect.signature(ml_method_class.__init__).parameters.keys()
        has_list_params = any([isinstance(ml_params[param], list) for param in ml_params.keys()])

        if has_list_params and "parameter_grid" in init_method_keys:
            ParameterValidator.assert_type_and_value(ml_specification['model_selection_cv'], bool, MLParser.__name__,
                                                     f'{key}: model_selection_cv', exact_value=True)
            ParameterValidator.assert_type_and_value(ml_specification['model_selection_n_folds'], int, MLParser.__name__,
                                                     f'{key}: model_selection_n_folds', 2)
            # wrap scalar values in lists so everything fits the grid format
            grid = {param: value if isinstance(value, list) else [value] for param, value in ml_params.items()}
            ml_method = ml_method_class(parameter_grid=grid)
        elif len(init_method_keys) == 3 and all(arg in init_method_keys for arg in ["parameters", "parameter_grid"]):
            ml_method = ml_method_class(parameters=ml_params)
        else:
            ml_method = ml_method_class(**ml_params)

    return ml_method, ml_params
def create_method_instance(ml_specification: dict, ml_method_class, key: str) -> tuple:
    """Instantiate an ML method from its spec; list-valued params require model_selection_cv to be True."""
    ml_params = {}
    spec_params = ml_specification[ml_method_class.__name__]

    if spec_params is None or len(spec_params.keys()) == 0:
        ml_method = ml_method_class()
    else:
        ml_params = spec_params
        init_method_keys = inspect.signature(ml_method_class.__init__).parameters.keys()
        has_list_params = any([isinstance(ml_params[param], list) for param in ml_params.keys()])

        if has_list_params and "parameter_grid" in init_method_keys:
            ParameterValidator.assert_type_and_value(ml_specification['model_selection_cv'], bool, MLParser.__name__,
                                                     f'{key}: model_selection_cv')
            assert ml_specification['model_selection_cv'] == True, f"MLParser: when running ML method {key} with a list of inputs, model_selection_cv must be True! " \
                                                                   f"Set the parameters for {key} to single values (not lists) or set model_selection_cv to True and model_selection_n_folds to >= 2"
            ParameterValidator.assert_type_and_value(ml_specification['model_selection_n_folds'], int, MLParser.__name__,
                                                     f'{key}: model_selection_n_folds', 2)
            # wrap scalar values in lists so everything fits the grid format
            grid = {param: value if isinstance(value, list) else [value] for param, value in ml_params.items()}
            ml_method = ml_method_class(parameter_grid=grid)
        elif len(init_method_keys) == 3 and all(arg in init_method_keys for arg in ["parameters", "parameter_grid"]):
            ml_method = ml_method_class(parameters=ml_params)
        else:
            ml_method = ml_method_class(**ml_params)

    return ml_method, ml_params
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path) -> MLApplicationInstruction:
    """Validate an MLApplication instruction and build it with the trained HP setting and label configuration."""
    location = MLApplicationParser.__name__
    valid_keys = ['type', 'dataset', 'number_of_processes', 'config_path', 'store_encoded_data']

    ParameterValidator.assert_keys(instruction.keys(), valid_keys, location, key)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET),
                                            location, f"{key}: dataset")
    ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, location,
                                             f"{key}: number_of_processes", min_inclusive=1)
    ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location, f'{key}: store_encoded_data')

    hp_setting, label = self._parse_hp_setting(instruction, path, key)

    return MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']),
                                    name=key,
                                    number_of_processes=instruction['number_of_processes'],
                                    label_configuration=LabelConfiguration([label]),
                                    hp_setting=hp_setting,
                                    store_encoded_data=instruction['store_encoded_data'])
def _prepare_parameters(use_positional_info, distance_to_seq_middle, flatten, name: str = None):
    """Validate OneHotEncoder parameters (variant without sequence_type); distance is dropped without positional info."""
    location = OneHotEncoder.__name__

    ParameterValidator.assert_type_and_value(use_positional_info, bool, location, "use_positional_info")
    if use_positional_info:
        ParameterValidator.assert_type_and_value(distance_to_seq_middle, int, location, "distance_to_seq_middle", min_inclusive=1)
    else:
        distance_to_seq_middle = None
    ParameterValidator.assert_type_and_value(flatten, bool, location, "flatten")

    return {"use_positional_info": use_positional_info,
            "distance_to_seq_middle": distance_to_seq_middle,
            "flatten": flatten,
            "name": name}