def create_pipeline(metric: str) -> Pipeline:
    # create the basic pipeline
    kanine_pipeline = Pipeline(context=PipelineContext.TESTING)
    kanine_pipeline.add_input(name='inputs')

    # step 0 - denormalize so that we have a single dataframe in the dataset
    step = PrimitiveStep(primitive_description=DenormalizePrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step.add_output('produce')
    kanine_pipeline.add_step(step)

    # step 1 - kanine classification
    step = PrimitiveStep(primitive_description=Kanine.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_output('produce')
    kanine_pipeline.add_step(step)

    # Adding output step to the pipeline
    kanine_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return kanine_pipeline
def _prepend_pipeline(base: pipeline.Pipeline,
                      prepend: pipeline.Pipeline) -> pipeline.Pipeline:
    # wrap pipeline in a sub pipeline - d3m core node replacement function
    # doesn't work otherwise
    subpipeline = pipeline.SubpipelineStep(pipeline=base)

    # if there isn't a placeholder, return the prepend pipeline unmodified
    if not any(isinstance(s, pipeline.PlaceholderStep) for s in prepend.steps):
        return prepend

    # find the placeholder node in the prepend and replace it with the base sub pipeline
    for i, step in enumerate(prepend.steps):
        if isinstance(step, pipeline.PlaceholderStep):
            # set inputs/outputs manually since the replace doesn't infer them
            for input_ref in step.inputs:
                subpipeline.add_input(input_ref)
            for output_id in step.outputs:
                subpipeline.add_output(output_id)

            prepend.replace_step(i, subpipeline)
            return prepend

    logger.warning(f'Failed to prepend pipeline {prepend.id} - continuing with base unmodified')
    return base
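# A minimal usage sketch for _prepend_pipeline (hypothetical pipeline objects;
# assumes `preprocessing` contains a single PlaceholderStep marking where the
# base pipeline should be spliced in):
#
#   full = _prepend_pipeline(base=model_pipeline, prepend=preprocessing)
#   assert not any(isinstance(s, pipeline.PlaceholderStep) for s in full.steps)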
def create_pipeline(metric: str) -> Pipeline:
    # create the basic pipeline
    vertex_nomination_pipeline = Pipeline(context=PipelineContext.TESTING)
    vertex_nomination_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(primitive_description=DistilSingleGraphLoaderPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step.add_output('produce')
    step.add_output('produce_target')
    vertex_nomination_pipeline.add_step(step)

    # step 1 - predict links
    step = PrimitiveStep(primitive_description=DistilLinkPredictionPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce_target')
    step.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    step.add_output('produce')
    vertex_nomination_pipeline.add_step(step)

    # Adding output step to the pipeline
    vertex_nomination_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return vertex_nomination_pipeline
def create_pipeline(metric: str) -> Pipeline:
    # create the basic pipeline
    vertex_classification_pipeline = Pipeline(context=PipelineContext.TESTING)
    vertex_classification_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(primitive_description=VertexClassificationParser.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step.add_output('produce')
    vertex_classification_pipeline.add_step(step)

    # step 1 - classify
    step = PrimitiveStep(primitive_description=VertexClassification.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_hyperparameter('jvm_memory', ArgumentType.VALUE, 0.6)
    step.add_output('produce')
    vertex_classification_pipeline.add_step(step)

    # Adding output step to the pipeline
    vertex_classification_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return vertex_classification_pipeline
def load_pipeline(pipeline_path):
    # load a pipeline description from a JSON or YAML file
    with open(pipeline_path) as pipeline_file:
        if pipeline_path.endswith('.json'):
            pipeline = Pipeline.from_json(pipeline_file)
        else:
            pipeline = Pipeline.from_yaml(pipeline_file)
    return pipeline
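# Usage sketch (hypothetical paths; the file extension selects the parser):
#
#   pipe = load_pipeline('pipelines/classification.json')  # parsed via Pipeline.from_json
#   pipe = load_pipeline('pipelines/classification.yml')   # parsed via Pipeline.from_yaml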
def create_pipeline_json(self, prim_dict):
    """
    Generate pipeline.json
    """
    name = "Pipeline for evaluation"
    pipeline_id = self.id  # + "_" + str(self.rank)
    pipeline_description = Pipeline(pipeline_id=pipeline_id, context=Context.EVALUATION, name=name)

    for ip in self.inputs:
        pipeline_description.add_input(name=ip['name'])

    num = self.num_steps()
    for i in range(num):
        p = prim_dict[self.primitives[i]]
        pdesc = {}
        pdesc['id'] = p.id
        pdesc['version'] = p.primitive_class.version
        pdesc['python_path'] = p.primitive_class.python_path
        pdesc['name'] = p.primitive_class.name
        pdesc['digest'] = p.primitive_class.digest
        step = PrimitiveStep(primitive_description=pdesc)

        for name, value in self.primitives_arguments[i].items():
            origin = value['origin']
            argument_type = ArgumentType.CONTAINER
            step.add_argument(name=name, argument_type=argument_type, data_reference=value['data'])
        step.add_output(output_id=p.primitive_class.produce_methods[0])

        if self.hyperparams[i] is not None:
            for name, value in self.hyperparams[i].items():
                step.add_hyperparameter(name=name, argument_type=ArgumentType.VALUE, data=value)

        pipeline_description.add_step(step)

    for op in self.outputs:
        pipeline_description.add_output(data_reference=op[2], name=op[3])

    self.pipeline_description = pipeline_description
def __setstate__(self, state: typing.Dict) -> None:
    """
    This method is used for unpickling the object. It takes a dictionary of
    the saved state of the object and restores the object to that state.

    Args:
        state: typing.Dict
            dictionary of the object's picklable state

    Returns:
    """
    # print("[INFO] Set state called!")
    fitted = state['fitted_pipe']
    del state['fitted_pipe']

    structure = state['pipeline']
    state['pipeline'] = Pipeline.from_json_structure(structure)

    random_seed = state['random_seed']
    run = Runtime(state['pipeline'],
                  fitted_pipeline_id=state['id'],
                  random_seed=random_seed,
                  volumes_dir=FittedPipeline.runtime_setting.volumes_dir,
                  log_dir=FittedPipeline.runtime_setting.log_dir)
    run.steps_state = fitted

    state['runtime'] = run
    self.__dict__ = state
def load_pipeline(pipeline_file: typing.Union[str, typing.Dict]):
    """
    Load a pipeline from a pipeline URI or a pipeline JSON structure.

    Parameters
    ----------
    pipeline_file: Union[str, dict]
        The URI pointing to a JSON file of a pipeline, or a dict that is
        the JSON structure of a pipeline.

    Returns
    -------
    pipeline: Pipeline
        An object of Pipeline
    """
    if isinstance(pipeline_file, dict):
        try:
            with d3m_utils.silence():
                pipeline = Pipeline.from_json_structure(pipeline_file)
        except Exception:
            pipeline = None
    else:
        with d3m_utils.silence():
            pipeline = get_pipeline(pipeline_path=pipeline_file, load_all_primitives=False)
    return pipeline
def load_pipeline(path, tunables=True, defaults=True):
    """Load a d3m json or yaml pipeline."""
    if not os.path.exists(path):
        base_path = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join('templates', path)
        path = os.path.join(base_path, path)

    if not os.path.isfile(path):
        raise ValueError('Could not find pipeline: {}'.format(path))

    LOGGER.warning('Loading pipeline from %s', path)
    with open(path) as pipeline:
        if path.endswith('yml'):
            data = yaml.safe_load(pipeline)
        else:
            data = json.load(pipeline)

    pipeline = Pipeline.from_json_structure(data)

    if tunables:
        # extract tunable hyperparameters
        tunable_hyperparameters = extract_tunable_hyperparams(pipeline)
        return pipeline, tunable_hyperparameters

    return pipeline
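# Usage sketch (hypothetical template name; with tunables=True the call returns a
# (pipeline, tunable_hyperparameters) tuple, otherwise just the pipeline):
#
#   pipeline, tunables = load_pipeline('single_table_classification.yml')
#   pipeline = load_pipeline('single_table_classification.yml', tunables=False)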
def generate_pipeline(pipeline_path: str, dataset_path: str,
                      problem_doc_path: str, resolver: Resolver = None) -> Runtime:
    """
    Simplified interface that fits a pipeline with a dataset.

    Parameters
    ----------
    pipeline_path
        Path to the pipeline description
    dataset_path:
        Path to the datasetDoc.json
    problem_doc_path:
        Path to the problemDoc.json
    resolver : Resolver
        Resolver to use.
    """
    # Pipeline description
    pipeline_description = None
    if '.json' in pipeline_path:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_json(string_or_file=pipeline_file, resolver=resolver)
    else:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_yaml(string_or_file=pipeline_file, resolver=resolver)

    # Problem Doc
    problem_doc = load_problem_doc(problem_doc_path)

    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(dataset_path=os.path.abspath(dataset_path))
    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)

    # Adding Metadata to Dataset
    dataset = add_target_columns_metadata(dataset, problem_doc)

    # Pipeline
    pipeline_runtime = Runtime(pipeline_description)

    # Fitting Pipeline
    pipeline_runtime.fit(inputs=[dataset])

    return pipeline_runtime
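# Usage sketch (hypothetical paths following the D3M seed-dataset layout):
#
#   runtime = generate_pipeline(
#       pipeline_path='pipeline.json',
#       dataset_path='TRAIN/dataset_TRAIN/datasetDoc.json',
#       problem_doc_path='TRAIN/problem_TRAIN/problemDoc.json')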
def _new_pipeline(pipeline, hyperparams=None):
    hyperparams = to_dicts(hyperparams) if hyperparams else dict()

    new_pipeline = Pipeline(context=Context.TESTING)
    for input_ in pipeline.inputs:
        new_pipeline.add_input(name=input_['name'])

    for step_id, old_step in enumerate(pipeline.steps):
        new_step = PrimitiveStep(primitive=old_step.primitive)
        for name, argument in old_step.arguments.items():
            new_step.add_argument(
                name=name,
                argument_type=argument['type'],
                data_reference=argument['data']
            )
        for output in old_step.outputs:
            new_step.add_output(output)

        new_hyperparams = hyperparams.get(str(step_id), dict())
        for name, hyperparam in old_step.hyperparams.items():
            if name not in new_hyperparams:
                new_step.add_hyperparameter(
                    name=name,
                    argument_type=ArgumentType.VALUE,
                    data=hyperparam['data']
                )

        for name, value in new_hyperparams.items():
            new_step.add_hyperparameter(
                name=name,
                argument_type=ArgumentType.VALUE,
                data=value
            )

        new_pipeline.add_step(new_step)

    for output in pipeline.outputs:
        new_pipeline.add_output(
            name=output['name'],
            data_reference=output['data']
        )

    new_pipeline.cv_scores = list()
    new_pipeline.score = None

    return new_pipeline
def generate_template(pipeline_file: str) -> dict:
    with open(pipeline_file) as f:
        pipeline = Pipeline.from_json(f)

    steps = []
    for i, step in enumerate(pipeline.steps):
        if not isinstance(step, PrimitiveStep):
            raise ValueError('Can only handle PrimitiveSteps')

        step_name = f'steps.{i}'

        hyperparameters = {}
        for name, value in step.hyperparams.items():
            if value['type'] == ArgumentType.VALUE:
                hyperparameters[name] = value['data']
            else:
                raise ValueError(f'Do not know how to parse hyperparam: {str(value)}')

        arguments = []
        argument_keys = set(step.arguments.keys())
        for argument_name in ['inputs', 'outputs', 'reference']:
            if argument_name in argument_keys:
                argument_keys.remove(argument_name)
                argument = step.arguments[argument_name]
                if argument['type'] == ArgumentType.CONTAINER:
                    if argument['data'] == 'inputs.0':
                        arguments.append('template_input')
                    elif argument['data'].startswith('steps.') and argument['data'].endswith('.produce'):
                        arguments.append(argument['data'][:-8])
                    else:
                        raise ValueError(f"Do not know how to parse argument: {argument['data']}")
                else:
                    raise ValueError(f"Do not know how to parse argument type: {argument['type']}")

        if len(argument_keys) > 0:
            for argument_name in argument_keys:
                print(argument_name, step.arguments[argument_name])
            raise ValueError(f"Unused arguments: {argument_keys}")

        primitive = OrderedDict()
        primitive['primitive'] = str(step.primitive)
        primitive['hyperparameters'] = hyperparameters

        template_step = OrderedDict()
        template_step['name'] = step_name
        template_step['primitives'] = [primitive]
        template_step['inputs'] = arguments
        steps.append(template_step)

    template = OrderedDict()
    template['name'] = pipeline.id if pipeline.name is None else pipeline.name
    template['taskType'] = {'TaskType'}
    template['taskSubtype'] = {'TaskSubtype'}
    template['inputType'] = {'table'}
    template['output'] = step_name
    template['steps'] = steps

    return template
def create_pipeline(metric: str) -> Pipeline:
    # create the basic pipeline
    graph_matching_pipeline = Pipeline(context=PipelineContext.TESTING)
    graph_matching_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(primitive_description=DistilGraphLoaderPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step.add_output('produce')
    step.add_output('produce_target')
    graph_matching_pipeline.add_step(step)

    # step 1 - match the graphs that have been seeded
    step = PrimitiveStep(primitive_description=DistilSeededGraphMatchingPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce_target')
    step.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    step.add_output('produce')
    graph_matching_pipeline.add_step(step)

    # convert predictions to expected format
    # step = PrimitiveStep(primitive_description=ConstructPredictionsPrimitive.metadata.query())
    # step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
    # step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce_target')
    # step.add_output('produce')
    # step.add_hyperparameter('use_columns', ArgumentType.VALUE, [0, 1])
    # graph_matching_pipeline.add_step(step)

    # Adding output step to the pipeline
    graph_matching_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return graph_matching_pipeline
def load_template():
    with open(join(os.path.dirname(__file__),
                   '../resource/pipelines/example_metalearningdb.json')) as fin:
        json_pipeline = json.load(fin)

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)
    grpc_pipeline = encode_pipeline_description(d3m_pipeline, ['RAW'], '/tmp')

    return grpc_pipeline
def build_pipeline(self, hyperparameters):
    """
    hyperparameters example:
    {
        'STEP5/d3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization/max_percent_null': 0,
        'STEP7/d3m.primitives.data_preprocessing.robust_scaler.SKlearn/quantile_range': (2.798121390864261, 14.852664215409096),
    }
    """
    values = hyperparameters.values
    pipeline_id = hyperparameters.get_pipeline_id()
    pipeline = copy.deepcopy(self.pipeline_candidates[pipeline_id])
    pipeline.id = str(uuid.uuid4())
    # update time
    pipeline.created = Pipeline().created
    skip_hps = set()
    # for key in sorted(values.keys()):
    for hp in hyperparameters.space:
        if hyperparameters.is_active(hp) and hp.name not in skip_hps and hp.name != PIPELINE_CHOICE:
            key = hp.name
            step, primitive_name, hp_name = hyperparameters.get_name_parts(key)
            value = values[key]
            step_idx = self.__get_step_idx_by_name(step)
            if step_idx is None:
                raise KeyError('{} not in the pipeline'.format(primitive_name))
            primitive_step = pipeline.steps[step_idx]
            arg_type = ArgumentType.VALUE
            # In order to avoid the following error:
            #   Value '0' for hyper-parameter
            #   'STEP8/d3m.primitives.classification.xgboost_gbtree.DataFrameCommon/max_delta_step'
            #   is not an instance of the structural type: typing.Union[int, NoneType]
            # here is a workaround that casts numpy scalars to plain Python types.
            if isinstance(value, np.int64):
                value = int(value)
            elif isinstance(value, np.str_):
                value = str(value)
            elif isinstance(value, np.bool_):
                value = bool(value)

            if hp_name in primitive_step.hyperparams:
                del primitive_step.hyperparams[hp_name]

            # Handle Choice hyperparameters
            if isinstance(hp, hyperparams.Choice):
                choice_cls = hp.choices[value]
                _vals = {}
                for name in choice_cls.configuration:
                    if name == 'choice':
                        _vals[name] = value
                    else:
                        _key = os.path.join(step, primitive_name, name)
                        _vals[name] = values[_key]
                        skip_hps.add(_key)
                value = choice_cls(_vals)

            primitive_step.add_hyperparameter(name=hp_name, argument_type=arg_type, data=value)

    return pipeline
def __init__(self, metadata, main_resource, data_types, loaded_primitives,
             problem=None, start_resource='inputs.0'):
    self.metadata = metadata
    self.main_resource = main_resource
    self.data_types = data_types
    self.loaded_primitives = loaded_primitives
    self.start_resource = start_resource
    self.problem = problem

    # Creating pipeline
    pipeline_description = Pipeline(context=Context.TESTING)
    pipeline_description.add_input(name='inputs')
    self.pipeline = pipeline_description
    self.d2d_step = None
    self.attr_step = None
    self.targ_step = None
    self._generate_pipeline()
def evaluate(pipeline, data_pipeline, dataset, metrics, problem,
             scoring_config, dataset_uri, timeout_run):
    if is_collection(dataset_uri[7:]):
        dataset = get_dataset_sample(dataset, problem)

    json_pipeline = convert.to_d3m_json(pipeline)

    if (TaskKeyword.GRAPH in problem['problem']['task_keywords']
            and json_pipeline['description'].startswith('MtLDB')):
        return {0: {'ACCURACY': 1.0}, 1: {'ACCURACY': 1.0}}

    logger.info("Pipeline to be scored:\n\t%s",
                '\n\t'.join([x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)
    if 'method' in scoring_config:
        scoring_config.pop('method')

    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker,
                args=(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
                      dataset, scoring_config, metrics, return_dict))
    p.start()
    p.join(timeout_run)
    p.terminate()

    if 'run_results' not in return_dict or 'run_scores' not in return_dict:
        raise TimeoutError('Reached timeout (%d seconds) to score a pipeline' % timeout_run)

    run_results = return_dict['run_results']
    run_scores = return_dict['run_scores']

    for result in run_results:
        if result.has_error():
            raise RuntimeError(result.pipeline_run.status['message'])

    # save_pipeline_runs(run_results.pipeline_runs)
    combined_folds = d3m.runtime.combine_folds([fold for fold in run_scores])
    scores = {}

    for _, row in combined_folds.iterrows():
        if row['fold'] not in scores:
            scores[row['fold']] = {}
        scores[row['fold']][row['metric']] = row['value']

    return scores
def DescribeSolution(self, request, context):
    solution_id = request.solution_id
    info_dict = self.get_from_stage_outputs("GetSearchSolutionsResults", solution_id)

    # Serialize the pipeline
    pipeline_json = info_dict["pipeline_json"]
    allowed_value_types = info_dict["allowed_value_types"]
    pipeline = Pipeline.from_json(pipeline_json)
    pipeline_description = ta3ta2utils.encode_pipeline_description(
        pipeline, allowed_value_types, "/tmp")

    return core_pb2.DescribeSolutionResponse(pipeline=pipeline_description)
def keras2pipeline(keras_model, batch_size=32):
    # Creating pipeline
    from tensorflow.python.keras.activations import softmax

    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    set_data(pipeline_description)
    set_loss(pipeline_description)

    offset = len(pipeline_description.steps)
    previous_layer_ids = get_previous_layer_ids(keras_model)

    layers = keras_model.layers
    step_id = 0
    layer_to_step_id = {}
    total_layer_num = len(layers)

    for i, layer in enumerate(layers):
        cls_name = get_layer_class_name(layer)
        if cls_name in OMIT_LAYERS:
            continue
        layer_id = get_layer_id(layer)
        if len(previous_layer_ids[layer_id]) > 0:
            layer.previous_layer_ids = tuple(
                layer_to_step_id[i] + offset for i in previous_layer_ids[layer_id]
            )
        else:
            layer.previous_layer_ids = [None]
        # Since JPL does not support a Softmax layer, we add a workaround that
        # sets softmax as the activation of the last Dense layer.
        if i == total_layer_num - 2 and cls_name == 'Dense':
            layer.activation = softmax
        d3m_step = step_function[cls_name](step_id, layer)
        pipeline_description.add_step(d3m_step)
        layer_to_step_id[layer_id] = step_id
        step_id += 1

    set_learner(pipeline_description, batch_size)
    set_prediction(pipeline_description)
    pipeline_description.add_output(
        name='output predictions',
        data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce")
    return pipeline_description
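# Usage sketch (assumes a built tf.keras model; layers listed in OMIT_LAYERS are
# skipped and the step_function mapping converts each remaining layer to a D3M step):
#
#   model = build_keras_model()  # any compiled tf.keras model (hypothetical helper)
#   pipeline_description = keras2pipeline(model, batch_size=64)
#   print(pipeline_description.to_json())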
def load_pipeline_architecture(self, pipeline_architecture_dict):
    """
    Loads pipeline architecture dictionary and returns a d3m Pipeline object.

    Return pipeline
    """
    pipeline_description = Pipeline(context=Context.TESTING)
    pipeline_description.add_input(name='inputs')

    # For each corresponding stage in the dictionary create a step
    steps = []
    stage_name_to_reference_name = {}
    for stage_dict in pipeline_architecture_dict:
        # Extract stage attributes
        primitive = stage_dict["primitive"]
        if isinstance(primitive, str):
            primitive = get_primitive_with_name(primitive)
        cur_stage_name = stage_dict["stage_name"]
        input_stage = stage_dict["input"]

        # Create primitive step
        step = PrimitiveStep(primitive_description=primitive.metadata.query())
        data_reference = "inputs.0" if input_stage == PipelineWrapper.PIPELINE_INPUT \
            else stage_name_to_reference_name[input_stage]
        step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER,
                          data_reference=data_reference)
        if "hyperparameters" in stage_dict:
            for k, v in stage_dict["hyperparameters"].items():
                step.add_hyperparameter(name=k, argument_type=ArgumentType.VALUE, data=v)
        if "arguments" in stage_dict:
            for k, v in stage_dict["arguments"].items():
                step.add_argument(name=k, argument_type=ArgumentType.CONTAINER,
                                  data_reference=stage_name_to_reference_name[v])
        step.add_output("produce")
        pipeline_description.add_step(step)
        reference_name = next(iter(step.get_output_data_references()))

        # Update accounting
        stage_name_to_reference_name[cur_stage_name] = reference_name
        steps.append(step)

    # Output is output of the last step
    last_output_reference = next(iter(steps[-1].get_output_data_references()))
    pipeline_description.add_output(name="output", data_reference=last_output_reference)

    return pipeline_description
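# Usage sketch (hypothetical stage names and receiver object; each stage dict needs
# "primitive", "stage_name", and "input", with optional "hyperparameters" and "arguments"):
#
#   architecture = [
#       {"primitive": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
#        "stage_name": "to_df", "input": PipelineWrapper.PIPELINE_INPUT},
#       {"primitive": "d3m.primitives.data_transformation.column_parser.Common",
#        "stage_name": "parse", "input": "to_df"},
#   ]
#   pipeline = wrapper.load_pipeline_architecture(architecture)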
def load_schema_only(
        cls, pipeline_id: str, folder_loc: str, pipeline_schema_subdir: str
) -> typing.Tuple[Pipeline, typing.Dict]:
    pipeline_dir = os.path.join(folder_loc, pipeline_schema_subdir)
    subpipeline_dir = os.path.join(folder_loc, cls.subpipelines_subdir)
    pipeline_schema = os.path.join(pipeline_dir, pipeline_id + '.json')

    with open(pipeline_schema, 'r') as f:
        structure = json.load(f)

    resolver = Resolver(pipeline_search_paths=[pipeline_dir, subpipeline_dir])
    pipeline = Pipeline.from_json_structure(pipeline_description=structure, resolver=resolver)
    return (pipeline, structure)
def score(self, input_item):
    problem_doc, metric, pipeline_json, dataset_train, dataset_test = input_item[1:]

    # Run pipeline
    pipeline = Pipeline.from_json(pipeline_json)
    pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
    pipeline_runtime.fit(inputs=[dataset_train], return_values=['outputs.0'])
    score_predictions = pipeline_runtime.produce(inputs=[dataset_test], return_values=['outputs.0'])
    score_predictions = score_predictions.values['outputs.0']

    # Evaluate scores on score dir
    achieved_score = utils.train_utils.score(score_predictions, dataset_test,
                                             problem_doc, override_metric_key=metric)
    return achieved_score
def fitproduce(self, input_item):
    problem_doc, pipeline_json, dataset_train, dataset_test = input_item[1:]

    # Run pipeline
    pipeline = Pipeline.from_json(pipeline_json)
    pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
    pipeline_runtime.fit(inputs=[dataset_train], return_values=['outputs.0'])
    score_predictions = pipeline_runtime.produce(inputs=[dataset_test], return_values=['outputs.0'])
    score_predictions = score_predictions.values['outputs.0']

    # Write predictions to output path
    path = self.get_predictions_save_path()
    utils.utils.write_predictions_to_file(score_predictions, path, problem_doc)
    path_uri = "file://%s" % path
    return path_uri
def _return_pipelines(self, task_type, task_subtype, data_type):
    """
    A function that returns predefined pipelines given a task type.

    Returns
    -------
    Predefined pipelines if there are pipelines left; if there is a template,
    returns the new pipelines built with the template.
    """
    # TODO incorporate task_subtype and data_type for future problems
    with open(schemas_utils.PIPELINES_DB_DIR) as file:
        possible_pipelines_dict = json.load(file)

    if task_type not in possible_pipelines_dict:
        self.pipeline_left = False
        return None

    possible_pipelines_dict = possible_pipelines_dict[task_type]
    if not possible_pipelines_dict:
        return []

    possible_pipelines = []
    for pipeline_dict in possible_pipelines_dict:
        try:
            pipeline = pipeline_utils.load_pipeline(pipeline_dict)
            # update id
            pipeline.id = str(uuid.uuid4())
            # update time
            pipeline.created = Pipeline().created
            possible_pipelines.append(pipeline)
        except Exception:
            pass

    return possible_pipelines
import sys

from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: DISTIL/NK Storc primitive
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.clustering.k_means.Sloth'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_hyperparameter(name='nclusters', argument_type=ArgumentType.VALUE, data=10)
step_1.add_hyperparameter(name='long_format', argument_type=ArgumentType.VALUE, data=True)
step_1.add_output('produce')
def generate_only():
    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    # Step 0: dataset_to_dataframe
    step_0 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline_description.add_step(step_0)

    # Step 1: profiler
    step_1 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.schema_discovery.profiler.Common'))
    step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline_description.add_step(step_1)

    # Step 2: column_parser
    step_2 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.column_parser.Common'))
    step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline_description.add_step(step_2)

    # Step 3: DFS Single Table
    step_3 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'))
    step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
    step_3.add_output('produce')
    pipeline_description.add_step(step_3)

    # Step 4: learn model
    step_4 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.regression.xgboost_gbtree.Common'))
    step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
    step_4.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
    step_4.add_output('produce')
    pipeline_description.add_step(step_4)

    # Step 5: construct output
    step_5 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.construct_predictions.Common'))
    step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
    step_5.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
    step_5.add_output('produce')
    pipeline_description.add_step(step_5)

    # Final Output
    pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')

    # Generate .yml file for the pipeline
    import featuretools_ta1
    from pipeline_tests.utils import generate_pipeline

    dataset_name = 'LL1_retail_sales_total_MIN_METADATA'
    dataset_path = '/featuretools_ta1/datasets/seed_datasets_current'
    primitive_name = 'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    version = featuretools_ta1.__version__
    test_name = os.path.splitext(os.path.basename(__file__))[0][5:]
    yml, pipeline_run_file = generate_pipeline(primitive_name=primitive_name,
                                               pipeline_description=pipeline_description,
                                               dataset_name=dataset_name,
                                               test_name=test_name)

    # fit-score command
    fs_cmd = 'python3 -m d3m runtime -d /featuretools_ta1/datasets/ fit-score -p {}'.format(yml)
    fs_cmd += ' -r {}/{}/{}_problem/problemDoc.json'.format(dataset_path, dataset_name, dataset_name)
    fs_cmd += ' -i {}/{}/TRAIN/dataset_TRAIN/datasetDoc.json'.format(dataset_path, dataset_name)
    fs_cmd += ' -t {}/{}/TEST/dataset_TEST/datasetDoc.json'.format(dataset_path, dataset_name)
    fs_cmd += ' -a {}/{}/SCORE/dataset_SCORE/datasetDoc.json'.format(dataset_path, dataset_name)
    fs_cmd += ' -O {}'.format(pipeline_run_file)

    # Run pipeline to save pipeline_run file
    os.system(fs_cmd)

    # Create and return command for running from pipeline_run file
    pipeline_run_cmd = 'python3 -m d3m --pipelines-path /featuretools_ta1/MIT_FeatureLabs/{}/{}/pipelines/'.format(
        primitive_name, version)
    pipeline_run_cmd += ' runtime -d /featuretools_ta1/datasets/ fit-score'
    pipeline_run_cmd += ' -u {}'.format(pipeline_run_file)

    return pipeline_run_cmd
def __init__(self):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # DS to DF on input DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Simon
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_cleaning.column_type_profiler.Simon"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.0.produce")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # column parser on input DF
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.column_parser.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.1.produce")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # XGBoost
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.classification.xgboost_gbtree.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce")
    step.add_argument(name="outputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce")
    step.add_output("produce")
    step.add_hyperparameter(name="add_index_columns", argument_type=ArgumentType.VALUE, data=True)
    pipeline_description.add_step(step)

    # construct predictions
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.construct_predictions.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.3.produce")
    step.add_argument(name="reference", argument_type=ArgumentType.CONTAINER, data_reference="steps.0.produce")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(name="output predictions", data_reference="steps.4.produce")

    self.pipeline = pipeline_description
def build_pipeline(pipeline_info, pipeline_mapping, stdout=None):
    default_stdout = sys.stdout
    if stdout is not None:
        sys.stdout = stdout

    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    for primitive_info in pipeline_info:
        print(primitive_info.python_path)
        print(primitive_info.hyperparameter)
        print(primitive_info.ancestors)
        if primitive_info.python_path == 'HEAD':
            dataset_fullname = primitive_info.hyperparameter['dataset_folder']
            print(dataset_fullname)
            continue
        elif primitive_info.python_path == 'ENDING':
            ancestors = primitive_info.ancestors
            end_step_num = pipeline_mapping[ancestors['inputs']] - 1
            pipeline_description.add_output(name='output predictions',
                                            data_reference='steps.' + str(end_step_num) + '.produce')
        else:
            # print(primitive_info.python_path)
            primitive = index.get_primitive(primitive_info.python_path)
            step = PrimitiveStep(primitive=primitive)
            hyperparameters = primitive_info.hyperparameter
            ancestors = primitive_info.ancestors
            # add arguments
            # print(ancestors)
            if ancestors['inputs'] != 0:
                for ances_key in ancestors.keys():
                    print(ances_key, ancestors[ances_key], pipeline_mapping[ancestors[ances_key]] - 1)
                    step_num = pipeline_mapping[ancestors[ances_key]] - 1
                    step.add_argument(name=ances_key, argument_type=ArgumentType.CONTAINER,
                                      data_reference='steps.' + str(step_num) + '.produce')
            else:
                step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                                  data_reference='inputs.0')
            # add hyperparameters
            for hyper in hyperparameters.keys():
                # print(hyper, hyperparameters[hyper], type(hyperparameters[hyper]))
                hyper_value = hyperparameters[hyper]
                step.add_hyperparameter(name=hyper, argument_type=ArgumentType.VALUE, data=hyper_value)
            step.add_output('produce')
            pipeline_description.add_step(step)
            # print('\n')

    # Output to json
    data = pipeline_description.to_json()
    with open('example_pipeline.json', 'w') as f:
        f.write(data)
    print(data)
    # yaml = pipeline_description.to_yaml()
    # with open('example_pipeline.yml', 'w') as f:
    #     f.write(yaml)
    # print(yaml)

    sys.stdout.flush()
    sys.stdout = default_stdout
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams
import numpy as np

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                            extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
def __init__(
    self,
    epochs: int = 5000,
    attention_lstm: bool = True,
):
    pipeline_description = Pipeline()
    pipeline_description.add_input(name="inputs")

    # TS formatter
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # DS to DF on formatted ts DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.0.produce")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # DS to DF on input DS
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # column parser on input DF
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.column_parser.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce")
    step.add_output("produce")
    step.add_hyperparameter(
        name="parse_semantic_types",
        argument_type=ArgumentType.VALUE,
        data=[
            "http://schema.org/Boolean",
            "http://schema.org/Integer",
            "http://schema.org/Float",
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        ],
    )
    pipeline_description.add_step(step)

    # parse target semantic types
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.3.produce")
    step.add_hyperparameter(
        name="semantic_types",
        argument_type=ArgumentType.VALUE,
        data=["https://metadata.datadrivendiscovery.org/types/Target"],
    )
    step.add_output("produce")
    pipeline_description.add_step(step)

    # LSTM FCN
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.time_series_classification.convolutional_neural_net.LSTM_FCN"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.1.produce")
    step.add_argument(name="outputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.4.produce")
    step.add_hyperparameter(name="epochs", argument_type=ArgumentType.VALUE, data=epochs)
    step.add_hyperparameter(name="attention_lstm", argument_type=ArgumentType.VALUE, data=attention_lstm)
    step.add_output("produce")
    pipeline_description.add_step(step)

    # construct predictions
    step = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.construct_predictions.Common"))
    step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.5.produce")
    step.add_argument(name="reference", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce")
    step.add_output("produce")
    pipeline_description.add_step(step)

    # Final Output
    pipeline_description.add_output(name="output predictions", data_reference="steps.6.produce")

    self.pipeline = pipeline_description