def create_pipeline(metric: str) -> Pipeline:
    """Build a two-step vertex-classification pipeline.

    NOTE(review): `metric` is currently unused by this builder — confirm
    whether it should be wired into a hyperparameter.
    """
    pipeline = Pipeline(context=PipelineContext.TESTING)
    pipeline.add_input(name='inputs')

    # step 0 - parse the input graphs
    parser_step = PrimitiveStep(
        primitive_description=VertexClassificationParser.metadata.query())
    parser_step.add_argument(name='inputs',
                             argument_type=ArgumentType.CONTAINER,
                             data_reference='inputs.0')
    parser_step.add_output('produce')
    pipeline.add_step(parser_step)

    # step 1 - classify the vertices
    classify_step = PrimitiveStep(
        primitive_description=VertexClassification.metadata.query())
    classify_step.add_argument(name='inputs',
                               argument_type=ArgumentType.CONTAINER,
                               data_reference='steps.0.produce')
    classify_step.add_hyperparameter('jvm_memory', ArgumentType.VALUE, 0.6)
    classify_step.add_output('produce')
    pipeline.add_step(classify_step)

    # expose the classifier output as the pipeline output
    pipeline.add_output(name='output', data_reference='steps.1.produce')
    return pipeline
def add_primitive_to_pipeline(self, primitive, attributes, hyperparameters=(),
                              targets=None, produce_collection=False):
    """Append a step running `primitive` to self.pipeline and return the step.

    Parameters
    ----------
    primitive:
        The d3m primitive to wrap in a PrimitiveStep.
    attributes:
        Either a ready-made data reference string (e.g. 'steps.0.produce')
        or an object accepted by self.get_output_str().
    hyperparameters:
        Iterable of (name, argument_type, data) triples set on the step.
        Default is an empty tuple — the original `[]` default was a shared
        mutable default argument.
    targets:
        Optional 'outputs' argument; same two forms as `attributes`.
    produce_collection:
        When True, also expose the step's 'produce_collection' output.
    """
    inputs_ref = attributes if isinstance(
        attributes, str) else self.get_output_str(attributes)
    step = PrimitiveStep(primitive=primitive, resolver=self.resolver)
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference=inputs_ref)
    # Unpack the triples directly instead of indexing into each one.
    for name, argument_type, data in hyperparameters:
        step.add_hyperparameter(name=name, argument_type=argument_type,
                                data=data)
    if targets:
        outputs_ref = targets if isinstance(
            targets, str) else self.get_output_str(targets)
        step.add_argument(name='outputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference=outputs_ref)
    step.add_output('produce')
    if produce_collection:
        step.add_output('produce_collection')
    self.pipeline.add_step(step)
    return step
def create_pipeline(metric: str) -> Pipeline:
    """Build a single-graph link-prediction pipeline scored with `metric`."""
    pipeline = Pipeline(context=PipelineContext.TESTING)
    pipeline.add_input(name='inputs')

    # step 0 - load the graph; exposes both 'produce' and 'produce_target'
    loader = PrimitiveStep(
        primitive_description=DistilSingleGraphLoaderPrimitive.metadata.query())
    loader.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    loader.add_output('produce')
    loader.add_output('produce_target')
    pipeline.add_step(loader)

    # step 1 - predict links, configured with the requested metric
    predictor = PrimitiveStep(
        primitive_description=DistilLinkPredictionPrimitive.metadata.query())
    predictor.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                           data_reference='steps.0.produce')
    predictor.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                           data_reference='steps.0.produce_target')
    predictor.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    predictor.add_output('produce')
    pipeline.add_step(predictor)

    # expose step 1's predictions as the pipeline output
    pipeline.add_output(name='output', data_reference='steps.1.produce')
    return pipeline
def create_pipeline_json(self, prim_dict):
    """Build the d3m Pipeline description for this solution.

    Populates self.pipeline_description from self.primitives,
    self.primitives_arguments and self.hyperparams, resolving primitive
    metadata through `prim_dict`.
    """
    pipeline_description = Pipeline(pipeline_id=self.id,
                                    context=Context.EVALUATION,
                                    name="Pipeline for evaluation")
    for ip in self.inputs:
        pipeline_description.add_input(name=ip['name'])

    for i in range(self.num_steps()):
        p = prim_dict[self.primitives[i]]
        # Minimal primitive description — enough for the runtime to resolve it.
        pdesc = {
            'id': p.id,
            'version': p.primitive_class.version,
            'python_path': p.primitive_class.python_path,
            'name': p.primitive_class.name,
            'digest': p.primitive_class.digest,
        }
        step = PrimitiveStep(primitive_description=pdesc)
        # Loop variables renamed so they no longer shadow the outer scope;
        # the unused `origin` local has been removed.
        for arg_name, arg_value in self.primitives_arguments[i].items():
            step.add_argument(name=arg_name,
                              argument_type=ArgumentType.CONTAINER,
                              data_reference=arg_value['data'])
        step.add_output(output_id=p.primitive_class.produce_methods[0])
        if self.hyperparams[i] is not None:
            for hp_name, hp_value in self.hyperparams[i].items():
                step.add_hyperparameter(name=hp_name,
                                        argument_type=ArgumentType.VALUE,
                                        data=hp_value)
        pipeline_description.add_step(step)

    for op in self.outputs:
        pipeline_description.add_output(data_reference=op[2], name=op[3])
    self.pipeline_description = pipeline_description
def __init__(self):
    """Build a fixed time-series clustering pipeline ending in Sloth KMeans."""
    pipeline = Pipeline()
    pipeline.add_input(name="inputs")

    # Three chained preprocessing steps:
    # ts formatter -> dataset-to-dataframe -> grouping-field compose.
    preprocessing = (
        "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter",
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
        "d3m.primitives.data_transformation.grouping_field_compose.Common",
    )
    data_ref = "inputs.0"
    for idx, path in enumerate(preprocessing):
        step = PrimitiveStep(primitive=index.get_primitive(path))
        step.add_argument(name="inputs",
                          argument_type=ArgumentType.CONTAINER,
                          data_reference=data_ref)
        step.add_output("produce")
        pipeline.add_step(step)
        data_ref = "steps.{}.produce".format(idx)

    # Step 3: Sloth KMeans with three clusters.
    kmeans = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.clustering.k_means.Sloth"))
    kmeans.add_argument(name="inputs",
                        argument_type=ArgumentType.CONTAINER,
                        data_reference=data_ref)
    kmeans.add_hyperparameter(name="nclusters",
                              argument_type=ArgumentType.VALUE,
                              data=3)
    kmeans.add_output("produce")
    pipeline.add_step(kmeans)

    pipeline.add_output(name="output predictions",
                        data_reference="steps.3.produce")
    self.pipeline = pipeline
def _gen_pipeline(self):
    """Build a nomination pipeline over two dataset resources.

    Steps 0-2 extract three dataframe views of the input (reference,
    resource '1', resource '2'); step 3 runs EuclideanNomination on them.
    """
    pipeline = meta_pipeline.Pipeline()
    pipeline.add_input(name='inputs')

    df_primitive = index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common')

    # Step 0: reference dataframe (default resource).
    step = PrimitiveStep(primitive=df_primitive)
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    pipeline.add_step(step)

    # Steps 1 and 2: dataframes for resources '1' and '2'.
    for resource in ('1', '2'):
        step = PrimitiveStep(primitive=df_primitive)
        step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                          data_reference='inputs.0')
        step.add_hyperparameter(name='dataframe_resource',
                                argument_type=ArgumentType.VALUE,
                                data=resource)
        step.add_output('produce')
        pipeline.add_step(step)

    # Step 3: nominate matches between the two resources.
    nominate = meta_pipeline.PrimitiveStep(
        primitive_description=EuclideanNomination.metadata.query())
    nominate.add_argument(name='inputs_1', argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.1.produce')
    nominate.add_argument(name='inputs_2', argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.2.produce')
    nominate.add_argument(name='reference', argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.0.produce')
    nominate.add_output('produce')
    pipeline.add_step(nominate)

    pipeline.add_output(name='Predictions', data_reference='steps.3.produce')
    return pipeline
def build_demo_pipeline():
    """Demo pipeline: Featuretools DFS -> imputer -> random forest -> predictions."""
    pipeline = Pipeline(context=Context.TESTING)
    pipeline.add_input(name='inputs')

    # Step 0: deep feature synthesis on the raw input.
    dfs = PrimitiveStep(primitive_description=Featuretools.metadata.query())
    dfs.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                     data_reference='inputs.0')
    dfs.add_output('produce')
    pipeline.add_step(dfs)

    # Step 1: impute missing values.
    imputer = PrimitiveStep(
        primitive_description=SKlearnImputer.metadata.query())
    imputer.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.0.produce')
    imputer.add_output('produce')
    pipeline.add_step(imputer)

    # Step 2: random forest, semantic-type driven, index columns kept.
    rfc = PrimitiveStep(primitive_description=SKlearnRFC.metadata.query())
    rfc.add_hyperparameter(name='use_semantic_types',
                           argument_type=ArgumentType.VALUE, data=True)
    rfc.add_hyperparameter(name='add_index_columns',
                           argument_type=ArgumentType.VALUE, data=True)
    rfc.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                     data_reference='steps.1.produce')
    rfc.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                     data_reference='steps.1.produce')
    rfc.add_output('produce')
    pipeline.add_step(rfc)

    # Step 3: assemble the predictions frame.
    construct = PrimitiveStep(
        primitive_description=DataFrameCommon.metadata.query())
    construct.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                           data_reference='steps.2.produce')
    construct.add_argument(name='reference', argument_type=ArgumentType.CONTAINER,
                           data_reference='steps.2.produce')
    construct.add_output('produce')
    pipeline.add_step(construct)

    pipeline.add_output(name='output predictions',
                        data_reference='steps.3.produce')
    return pipeline
def community_detection(resolver=None):
    """Build a community-detection pipeline (parser -> PSL -> column prune).

    Parameters
    ----------
    resolver:
        Optional step resolver; defaults to a BlackListResolver.
    """
    if resolver is None:
        resolver = custom_resolver.BlackListResolver()

    pipeline_description = Pipeline(context=PipelineContext.TESTING)
    pipeline_description.add_input(name='inputs')

    # Step 0: parse the input graph for community detection.
    step_0 = PrimitiveStep(primitive_description=d3m.primitives.sri.graph.
                           CommunityDetectionParser.metadata.query(),
                           resolver=resolver)
    step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline_description.add_step(step_0)

    # Step 1: PSL community detection; jvm_memory hyperparameter is 0.5.
    step_1 = PrimitiveStep(primitive_description=d3m.primitives.sri.psl.
                           CommunityDetection.metadata.query(),
                           resolver=resolver)
    step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_hyperparameter(name='jvm_memory',
                              argument_type=ArgumentType.VALUE, data=0.5)
    step_1.add_output('produce')
    pipeline_description.add_step(step_1)

    # Step 2: drop column 0 from the detection output.
    step_2 = PrimitiveStep(primitive_description=d3m.primitives.data.
                           RemoveColumns.metadata.query(),
                           resolver=resolver)
    step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_hyperparameter(name='columns',
                              argument_type=ArgumentType.VALUE, data=[0])
    step_2.add_output('produce')
    pipeline_description.add_step(step_2)

    pipeline_description.add_output(name='Result',
                                    data_reference='steps.2.produce')
    # Removed dead locals (`last_step`, `attributes`, `targets`) that were
    # computed but never used or returned.
    return pipeline_description
def create_pipeline(metric: str) -> Pipeline:
    """Build a Vector Auto-Regression (VAR) forecasting pipeline.

    NOTE(review): `metric` is currently unused — kept only for interface
    compatibility with the other create_pipeline() builders; confirm intent.
    Removed unused locals (`previous_step`, `input_val`).
    """
    var_pipeline = Pipeline(context=PipelineContext.TESTING)
    var_pipeline.add_input(name='inputs')

    # step 0 - Extract dataframe from dataset
    step = PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    var_pipeline.add_step(step)

    # step 1 - Parse boolean/integer/float/vector columns.
    step = PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_output('produce')
    semantic_types = (
        'http://schema.org/Boolean', 'http://schema.org/Integer',
        'http://schema.org/Float',
        'https://metadata.datadrivendiscovery.org/types/FloatVector')
    step.add_hyperparameter('parse_semantic_types', ArgumentType.VALUE,
                            semantic_types)
    var_pipeline.add_step(step)

    # step 2 - Vector Auto Regression
    step = PrimitiveStep(primitive_description=VAR.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.1.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.1.produce')
    step.add_output('produce')
    var_pipeline.add_step(step)

    # Adding output step to the pipeline
    var_pipeline.add_output(name='output', data_reference='steps.2.produce')
    return var_pipeline
def _new_pipeline(pipeline, hyperparams=None):
    """Clone `pipeline` into a fresh Pipeline with optional hyperparam overrides.

    `hyperparams` (after to_dicts) maps step index as a string to a
    {name: value} dict of overrides; hyperparameters not overridden keep
    their original data. The clone gets empty `cv_scores` and `score=None`.
    """
    overrides = to_dicts(hyperparams) if hyperparams else dict()
    clone = Pipeline(context=Context.TESTING)

    for pipeline_input in pipeline.inputs:
        clone.add_input(name=pipeline_input['name'])

    for step_index, source_step in enumerate(pipeline.steps):
        step = PrimitiveStep(primitive=source_step.primitive)
        for arg_name, argument in source_step.arguments.items():
            step.add_argument(name=arg_name,
                              argument_type=argument['type'],
                              data_reference=argument['data'])
        for output in source_step.outputs:
            step.add_output(output)

        step_overrides = overrides.get(str(step_index), dict())
        # Copy original hyperparameters unless an override replaces them...
        for hp_name, hp in source_step.hyperparams.items():
            if hp_name not in step_overrides:
                step.add_hyperparameter(name=hp_name,
                                        argument_type=ArgumentType.VALUE,
                                        data=hp['data'])
        # ...then apply every override.
        for hp_name, hp_value in step_overrides.items():
            step.add_hyperparameter(name=hp_name,
                                    argument_type=ArgumentType.VALUE,
                                    data=hp_value)
        clone.add_step(step)

    for output in pipeline.outputs:
        clone.add_output(name=output['name'],
                         data_reference=output['data'])

    clone.cv_scores = list()
    clone.score = None
    return clone
def load_pipeline_architecture(self, pipeline_architecture_dict):
    """Load a pipeline architecture description into a d3m Pipeline.

    Each entry describes one stage: its "primitive" (object or importable
    name string), a unique "stage_name", the "input" stage it consumes
    (or PipelineWrapper.PIPELINE_INPUT for the pipeline input), plus
    optional "hyperparameters" and extra "arguments" dicts.

    Returns the assembled Pipeline; its output is the last stage's output.
    """
    pipeline_description = Pipeline(context=Context.TESTING)
    pipeline_description.add_input(name='inputs')

    # For each corresponding stage in the dictionary create a step
    steps = []
    stage_name_to_reference_name = {}
    for stage_dict in pipeline_architecture_dict:
        # Extract stage attributes
        primitive = stage_dict["primitive"]
        # isinstance (was `type(...) == str`) — also accepts str subclasses.
        if isinstance(primitive, str):
            primitive = get_primitive_with_name(primitive)
        cur_stage_name = stage_dict["stage_name"]
        input_stage = stage_dict["input"]

        # Create primitive step
        step = PrimitiveStep(primitive_description=primitive.metadata.query())
        data_reference = ("inputs.0"
                          if input_stage == PipelineWrapper.PIPELINE_INPUT
                          else stage_name_to_reference_name[input_stage])
        step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER,
                          data_reference=data_reference)
        if "hyperparameters" in stage_dict:
            for k, v in stage_dict["hyperparameters"].items():
                step.add_hyperparameter(name=k,
                                        argument_type=ArgumentType.VALUE,
                                        data=v)
        if "arguments" in stage_dict:
            # Extra arguments always reference a previously-added stage.
            for k, v in stage_dict["arguments"].items():
                step.add_argument(name=k,
                                  argument_type=ArgumentType.CONTAINER,
                                  data_reference=stage_name_to_reference_name[v])
        step.add_output("produce")
        pipeline_description.add_step(step)
        reference_name = next(iter(step.get_output_data_references()))

        # Update accounting
        stage_name_to_reference_name[cur_stage_name] = reference_name
        steps.append(step)

    # Output is output of the last step
    last_output_reference = next(iter(steps[-1].get_output_data_references()))
    pipeline_description.add_output(name="output",
                                    data_reference=last_output_reference)
    return pipeline_description
def create_pipeline(metric: str) -> Pipeline:
    """Build a seeded graph-matching pipeline scored with `metric`."""
    pipeline = Pipeline(context=PipelineContext.TESTING)
    pipeline.add_input(name='inputs')

    # step 0 - load the graphs; exposes 'produce' and 'produce_target'
    loader = PrimitiveStep(
        primitive_description=DistilGraphLoaderPrimitive.metadata.query())
    loader.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    loader.add_output('produce')
    loader.add_output('produce_target')
    pipeline.add_step(loader)

    # step 1 - match the graphs that have been seeded
    matcher = PrimitiveStep(
        primitive_description=DistilSeededGraphMatchingPrimitive.metadata.query())
    matcher.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.0.produce')
    matcher.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.0.produce_target')
    matcher.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    matcher.add_output('produce')
    pipeline.add_step(matcher)

    # The matcher's predictions are exposed directly as the pipeline output
    # (no ConstructPredictions conversion step).
    pipeline.add_output(name='output', data_reference='steps.1.produce')
    return pipeline
def create_pipeline(metric: str) -> Pipeline:
    """Build a Parrot-ARIMA time-series forecasting pipeline.

    NOTE(review): `metric` is currently unused — kept only for interface
    compatibility with the other create_pipeline() builders; confirm intent.
    Removed unused locals (`previous_step`, `input_val`) and the
    commented-out ConstructPredictions step.
    """
    tsf_pipeline = Pipeline(context=PipelineContext.TESTING)
    tsf_pipeline.add_input(name='inputs')

    # step 0 - Extract dataframe from dataset
    step = PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    tsf_pipeline.add_step(step)

    # step 1 - Parse boolean/integer/float/vector columns.
    step = PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_output('produce')
    semantic_types = (
        'http://schema.org/Boolean', 'http://schema.org/Integer',
        'http://schema.org/Float',
        'https://metadata.datadrivendiscovery.org/types/FloatVector')
    step.add_hyperparameter('parse_semantic_types', ArgumentType.VALUE,
                            semantic_types)
    tsf_pipeline.add_step(step)

    # step 2 - Parrot ARIMA forecaster
    step = PrimitiveStep(primitive_description=Parrot.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.1.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.1.produce')
    step.add_hyperparameter(name='seasonal_differencing',
                            argument_type=ArgumentType.VALUE, data=11)
    step.add_hyperparameter(name='n_periods',
                            argument_type=ArgumentType.VALUE, data=21)
    step.add_output('produce')
    tsf_pipeline.add_step(step)

    # Adding output step to the pipeline
    tsf_pipeline.add_output(name='output', data_reference='steps.2.produce')
    return tsf_pipeline
def __init__(self, epochs: int = 10, n_steps: int = 20):
    """Build an object-detection pipeline around RetinaNet.

    Parameters
    ----------
    epochs:
        Passed to RetinaNet's `n_epochs` hyperparameter.
    n_steps:
        Passed to RetinaNet's `n_steps` hyperparameter.
    """
    pipeline = Pipeline()
    pipeline.add_input(name="inputs")

    # Step 0: denormalize the input dataset.
    denormalize = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.denormalize.Common"))
    denormalize.add_argument(name="inputs",
                             argument_type=ArgumentType.CONTAINER,
                             data_reference="inputs.0")
    denormalize.add_output("produce")
    pipeline.add_step(denormalize)

    # Step 1: dataset -> dataframe.
    to_dataframe = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    to_dataframe.add_argument(name="inputs",
                              argument_type=ArgumentType.CONTAINER,
                              data_reference="steps.0.produce")
    to_dataframe.add_output("produce")
    pipeline.add_step(to_dataframe)

    # Step 2: RetinaNet detector; weights_path points at /scratch_dir/.
    retinanet = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.object_detection.retina_net.ObjectDetectionRN"))
    retinanet.add_argument(name="inputs",
                           argument_type=ArgumentType.CONTAINER,
                           data_reference="steps.1.produce")
    retinanet.add_argument(name="outputs",
                           argument_type=ArgumentType.CONTAINER,
                           data_reference="steps.1.produce")
    retinanet.add_hyperparameter(name="n_epochs",
                                 argument_type=ArgumentType.VALUE,
                                 data=epochs)
    retinanet.add_hyperparameter(name="n_steps",
                                 argument_type=ArgumentType.VALUE,
                                 data=n_steps)
    retinanet.add_hyperparameter(name="weights_path",
                                 argument_type=ArgumentType.VALUE,
                                 data="/scratch_dir/")
    retinanet.add_output("produce")
    pipeline.add_step(retinanet)

    pipeline.add_output(name="output predictions",
                        data_reference="steps.2.produce")
    self.pipeline = pipeline
# Step 2: column parser on input DF step_2 = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.data_transformation.column_parser.Common")) step_2.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.1.produce", ) step_2.add_output("produce") step_2.add_hyperparameter( name="parse_semantic_types", argument_type=ArgumentType.VALUE, data=[ "http://schema.org/Boolean", "http://schema.org/Integer", "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/FloatVector", "http://schema.org/DateTime", ], ) pipeline_description.add_step(step_2) # Step 3: parse attribute and index semantic types step_3 = PrimitiveStep(primitive=index.get_primitive( "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" )) step_3.add_argument( name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="steps.2.produce",
# Step 1: Column Parser primitive_1 = index.get_primitive( 'd3m.primitives.data_transformation.column_parser.Common') step_1 = PrimitiveStep(primitive=primitive_1) step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') step_1.add_output('produce') pipeline_description.add_step(step_1) # Step 2: Discrete Cosine Transform primitive_2 = index.get_primitive( 'd3m.primitives.tods.feature_analysis.discrete_cosine_transform') step_2 = PrimitiveStep(primitive=primitive_2) step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2, 3, 4)) step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') step_2.add_output('produce') pipeline_description.add_step(step_2) # Final Output pipeline_description.add_output(name='output predictions',
# Step 0: denormalize the input dataset.
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe on the 'learningData' resource.
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK data cleaning
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_cleaning.data_cleaning.Datacleaning'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Step 3: column_parser
# NOTE(review): this statement is cut off at the end of the visible chunk.
step_3 = PrimitiveStep(primitive=index.get_primitive(
# Step 0: denormalize the input dataset.
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe on the 'learningData' resource.
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK Parrot ARIMA forecaster; inputs and outputs both
# reference the step-1 dataframe.
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.time_series_forecasting.arima.Parrot'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
# NOTE(review): this call is cut off at the end of the visible chunk.
step_2.add_hyperparameter(name='datetime_index',
                          argument_type=ArgumentType.VALUE,
# # Step 1: column_parser primitive_1 = index.get_primitive( 'd3m.primitives.data_transformation.column_parser.Common') step_1 = PrimitiveStep(primitive=primitive_1) step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') step_1.add_output('produce') pipeline_description.add_step(step_1) # # Step 2: Standardization primitive_2 = index.get_primitive( 'd3m.primitives.tods.timeseries_processing.transformation.standard_scaler') step_2 = PrimitiveStep(primitive=primitive_2) step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2, 3, 4, 5, 6)) step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') step_2.add_output('produce') pipeline_description.add_step(step_2) # # Step 3: test primitive # primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive')
# Step 0: denormalize the input dataset.
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe on the 'learningData' resource.
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK data cleaning
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_cleaning.data_cleaning.Datacleaning'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Step 3: column_parser
# NOTE(review): this statement is cut off at the end of the visible chunk.
step_3 = PrimitiveStep(primitive=index.get_primitive(
# Wire step_0 (defined earlier in this file) to the pipeline input.
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser on the step-0 dataframe.
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: MO-GAAL anomaly detector, contamination 0.1,
# return_result='append'.
primitive_2 = index.get_primitive(
    'd3m.primitives.tods.detection_algorithm.pyod_mogaal')
step_2 = PrimitiveStep(primitive=primitive_2)
step_2.add_hyperparameter(name='contamination',
                          argument_type=ArgumentType.VALUE, data=0.1)
step_2.add_hyperparameter(name='use_semantic_types',
                          argument_type=ArgumentType.VALUE, data=True)
# Restricted to column 2 only — multi-dimensional input reported broken here.
step_2.add_hyperparameter(name='use_columns',
                          argument_type=ArgumentType.VALUE, data=(2,))
step_2.add_hyperparameter(name='return_result',
                          argument_type=ArgumentType.VALUE, data='append')
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Final Output
pipeline_description.add_output(name='output predictions',
                                data_reference='steps.2.produce')

# Serialize the pipeline to YAML on disk and echo it.
# NOTE(review): local `yaml` shadows the common `yaml` module name.
yaml = pipeline_description.to_yaml()
with open('pipeline.yml', 'w') as f:
    f.write(yaml)
print(yaml)
def __init__(self):
    """Classification pipeline: Simon type profiling -> column parser -> XGBoost."""
    pipeline = Pipeline()
    pipeline.add_input(name="inputs")

    # Step 0: dataset -> dataframe.
    to_dataframe = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
    to_dataframe.add_argument(name="inputs",
                              argument_type=ArgumentType.CONTAINER,
                              data_reference="inputs.0")
    to_dataframe.add_output("produce")
    pipeline.add_step(to_dataframe)

    # Step 1: Simon column-type profiler.
    simon = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_cleaning.column_type_profiler.Simon"))
    simon.add_argument(name="inputs",
                       argument_type=ArgumentType.CONTAINER,
                       data_reference="steps.0.produce")
    simon.add_output("produce")
    pipeline.add_step(simon)

    # Step 2: column parser on the profiled dataframe.
    parser = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.column_parser.Common"))
    parser.add_argument(name="inputs",
                        argument_type=ArgumentType.CONTAINER,
                        data_reference="steps.1.produce")
    parser.add_output("produce")
    pipeline.add_step(parser)

    # Step 3: XGBoost classifier; index columns are carried through.
    xgboost = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.classification.xgboost_gbtree.Common'))
    xgboost.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.2.produce')
    xgboost.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                         data_reference='steps.2.produce')
    xgboost.add_output('produce')
    xgboost.add_hyperparameter(name='add_index_columns',
                               argument_type=ArgumentType.VALUE, data=True)
    pipeline.add_step(xgboost)

    # Step 4: construct predictions against the raw step-0 dataframe.
    construct = PrimitiveStep(primitive=index.get_primitive(
        "d3m.primitives.data_transformation.construct_predictions.Common"))
    construct.add_argument(name="inputs",
                           argument_type=ArgumentType.CONTAINER,
                           data_reference="steps.3.produce")
    construct.add_argument(name="reference",
                           argument_type=ArgumentType.CONTAINER,
                           data_reference="steps.0.produce")
    construct.add_output("produce")
    pipeline.add_step(construct)

    pipeline.add_output(name="output predictions",
                        data_reference="steps.4.produce")
    self.pipeline = pipeline
# Wire step_1 (defined earlier in this file) to step 0's output.
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(
    name='semantic_types',
    argument_type=ArgumentType.VALUE,
    data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: standard-scaler (semantic-type aware) over selected columns.
primitive_3 = index.get_primitive(
    'd3m.primitives.tods.timeseries_processing.transformation.standard_scaler')
step_3 = PrimitiveStep(primitive=primitive_3)
step_3.add_hyperparameter(name='use_semantic_types',
                          argument_type=ArgumentType.VALUE, data=True)
# NOTE(review): this call is cut off at the end of the visible chunk.
step_3.add_hyperparameter(name='use_columns',
                          argument_type=ArgumentType.VALUE,
                          data=(
                              1,
                              2,
def __init__(self, epochs: int = 5000, attention_lstm: bool = True):
    """Build a time-series classification pipeline around LSTM-FCN.

    Args:
        epochs: number of training epochs passed to the LSTM-FCN primitive.
        attention_lstm: forwarded to the LSTM-FCN primitive's
            ``attention_lstm`` hyperparameter.

    The assembled pipeline is stored on ``self.pipeline``.
    """
    pipeline = Pipeline()
    pipeline.add_input(name="inputs")

    def _new_step(primitive_path, inputs_ref):
        # Every step below consumes a single CONTAINER 'inputs' argument.
        s = PrimitiveStep(primitive=index.get_primitive(primitive_path))
        s.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference=inputs_ref,
        )
        return s

    # Step 0: reformat the raw time-series dataset.
    formatter = _new_step(
        "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter",
        "inputs.0",
    )
    formatter.add_output("produce")
    pipeline.add_step(formatter)

    # Step 1: dataframe view of the formatted time-series dataset.
    formatted_df = _new_step(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
        "steps.0.produce",
    )
    formatted_df.add_output("produce")
    pipeline.add_step(formatted_df)

    # Step 2: dataframe view of the *original* dataset — used below for
    # target extraction and as the prediction reference.
    input_df = _new_step(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
        "inputs.0",
    )
    input_df.add_output("produce")
    pipeline.add_step(input_df)

    # Step 3: parse scalar/vector column types on the input dataframe.
    parser = _new_step(
        "d3m.primitives.data_transformation.column_parser.Common",
        "steps.2.produce",
    )
    parser.add_hyperparameter(
        name="parse_semantic_types",
        argument_type=ArgumentType.VALUE,
        data=[
            "http://schema.org/Boolean",
            "http://schema.org/Integer",
            "http://schema.org/Float",
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        ],
    )
    parser.add_output("produce")
    pipeline.add_step(parser)

    # Step 4: pull out the target column(s).
    targets = _new_step(
        "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
        "steps.3.produce",
    )
    targets.add_hyperparameter(
        name="semantic_types",
        argument_type=ArgumentType.VALUE,
        data=[
            "https://metadata.datadrivendiscovery.org/types/Target",
        ],
    )
    targets.add_output("produce")
    pipeline.add_step(targets)

    # Step 5: LSTM-FCN classifier over the formatted series and the targets.
    classifier = _new_step(
        "d3m.primitives.time_series_classification.convolutional_neural_net.LSTM_FCN",
        "steps.1.produce",
    )
    classifier.add_argument(
        name="outputs",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.4.produce",
    )
    classifier.add_hyperparameter(
        name="epochs", argument_type=ArgumentType.VALUE, data=epochs
    )
    classifier.add_hyperparameter(
        name="attention_lstm",
        argument_type=ArgumentType.VALUE,
        data=attention_lstm,
    )
    classifier.add_output("produce")
    pipeline.add_step(classifier)

    # Step 6: build the predictions frame, referencing the input dataframe.
    predictions = _new_step(
        "d3m.primitives.data_transformation.construct_predictions.Common",
        "steps.5.produce",
    )
    predictions.add_argument(
        name="reference",
        argument_type=ArgumentType.CONTAINER,
        data_reference="steps.2.produce",
    )
    predictions.add_output("produce")
    pipeline.add_step(predictions)

    # Final pipeline output.
    pipeline.add_output(
        name="output predictions", data_reference="steps.6.produce"
    )
    self.pipeline = pipeline
    # NOTE(review): fragment — this line closes a
    # `step_3 = PrimitiveStep(primitive=index.get_primitive(` call that
    # begins outside this chunk; step_3 is a column parser.
    'd3m.primitives.data_transformation.column_parser.Common'))
step_3.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.2.produce')
step_3.add_output('produce')
pipeline_description.add_step(step_3)

# Step 4: imputer; return_result='replace' writes imputed values back over
# the source columns instead of appending new ones.
step_4 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_cleaning.imputer.SKlearn'))
step_4.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.3.produce')
step_4.add_output('produce')
step_4.add_hyperparameter(name='return_result',
                          argument_type=ArgumentType.VALUE,
                          data='replace')
step_4.add_hyperparameter(name='use_semantic_types',
                          argument_type=ArgumentType.VALUE,
                          data=True)
pipeline_description.add_step(step_4)

# Step 5: classifier. (Original comment said "random_forest", but the
# primitive is actually XGBoost gradient-boosted trees.) The function
# continues past the end of this chunk.
step_5 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.classification.xgboost_gbtree.Common'))
step_5.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.4.produce')
step_5.add_argument(name='outputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.4.produce')
# NOTE(review): fragment — the enclosing function starts outside this chunk,
# and the final add_hyperparameter call is cut off mid-statement.
# Step 0: denormalize the dataset into a single table.
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe, pinned to the 'learningData' resource.
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: forecasting primitive. (Original comment said "DISTIL/NK VAR
# primitive", but the primitive actually loaded is arima.Parrot.)
# Both 'inputs' and 'outputs' are wired to the same dataframe.
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.time_series_forecasting.arima.Parrot'))
step_2.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_argument(name='outputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_hyperparameter(name='n_periods',
                          argument_type=ArgumentType.VALUE,
def build_pipeline(pipepline_info, pipepline_mapping, stdout=None):
    """Assemble a d3m ``Pipeline`` from primitive-info records and write it
    to ``example_pipeline.json``.

    Args:
        pipepline_info: iterable of records with ``python_path``,
            ``hyperparameter`` and ``ancestors`` attributes. A record whose
            ``python_path`` is ``'HEAD'`` marks the pipeline input (it only
            carries a ``dataset_folder`` hyperparameter); ``'ENDING'`` marks
            the sink whose ancestor becomes the pipeline output. (Note: the
            parameter names keep the original "pipepline" spelling for
            caller compatibility.)
        pipepline_mapping: maps an ancestor id to its 1-based step number.
        stdout: optional stream; when given, all diagnostic prints are
            redirected to it for the duration of the call.

    Fix over the original: the stdout swap is now wrapped in try/finally,
    so an exception during pipeline construction no longer leaves the
    process with a redirected ``sys.stdout``.
    """
    default_stdout = sys.stdout
    if stdout is not None:
        sys.stdout = stdout
    try:
        # Creating pipeline
        pipeline_description = Pipeline()
        pipeline_description.add_input(name='inputs')

        for primitive_info in pipepline_info:
            print(primitive_info.python_path)
            print(primitive_info.hyperparameter)
            print(primitive_info.ancestors)

            if primitive_info.python_path == 'HEAD':
                # HEAD carries only the dataset location, not a primitive.
                dataset_fullname = primitive_info.hyperparameter['dataset_folder']
                print(dataset_fullname)
                continue
            elif primitive_info.python_path == 'ENDING':
                # ENDING: expose its ancestor's produce output as the
                # pipeline's final output.
                ancestors = primitive_info.ancestors
                end_step_num = pipepline_mapping[ancestors['inputs']] - 1
                pipeline_description.add_output(
                    name='output predictions',
                    data_reference='steps.' + str(end_step_num) + '.produce')
            else:
                primitive = index.get_primitive(primitive_info.python_path)
                step = PrimitiveStep(primitive=primitive)
                hyperparameters = primitive_info.hyperparameter
                ancestors = primitive_info.ancestors

                # Wire arguments: an 'inputs' ancestor of 0 means "pipeline
                # input"; otherwise every ancestor is resolved to the
                # producing step via pipepline_mapping (1-based -> 0-based).
                if ancestors['inputs'] != 0:
                    for ances_key in ancestors.keys():
                        step_num = pipepline_mapping[ancestors[ances_key]] - 1
                        print(ances_key, ancestors[ances_key], step_num)
                        step.add_argument(
                            name=ances_key,
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.' + str(step_num) + '.produce')
                else:
                    step.add_argument(name='inputs',
                                      argument_type=ArgumentType.CONTAINER,
                                      data_reference='inputs.0')

                # All hyperparameters are attached as literal VALUEs.
                for hyper, hyper_value in hyperparameters.items():
                    step.add_hyperparameter(name=hyper,
                                            argument_type=ArgumentType.VALUE,
                                            data=hyper_value)

                step.add_output('produce')
                pipeline_description.add_step(step)

        # Serialize the finished pipeline to JSON (file name is fixed).
        data = pipeline_description.to_json()
        with open('example_pipeline.json', 'w') as f:
            f.write(data)
        print(data)
    finally:
        # Always restore stdout, even if construction or serialization fails.
        sys.stdout.flush()
        sys.stdout = default_stdout
# NOTE(review): fragment — the enclosing function starts outside this chunk,
# and the final add_hyperparameter call is cut off mid-statement.
# Step 2: parse column values into typed data.
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common'))
step_2.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Step 3: imputer -> fills null values; return_result='replace' overwrites
# the source columns in place.
step_3 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_cleaning.imputer.SKlearn'))
step_3.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.2.produce')
step_3.add_hyperparameter(name='return_result',
                          argument_type=ArgumentType.VALUE,
                          data='replace')
step_3.add_hyperparameter(name='use_semantic_types',
                          argument_type=ArgumentType.VALUE,
                          data=True)
step_3.add_output('produce')
pipeline_description.add_step(step_3)

# Step 4: unsupervised clustering of records with a label. (Original comment
# said "DISTIL/NK Storc"; the primitive actually used is hdbscan.Hdbscan.)
step_4 = PrimitiveStep(
    primitive=index.get_primitive('d3m.primitives.clustering.hdbscan.Hdbscan'))
step_4.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.3.produce')
step_4.add_hyperparameter(name='cluster_selection_method',
                          argument_type=ArgumentType.VALUE,
# NOTE(review): fragment — the enclosing function starts outside this chunk,
# and the last add_argument call is cut off mid-statement.
# Step 0: denormalize the dataset into a single table.
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Sloth k-means clustering (nclusters=10, long-format input).
# Both 'inputs' and 'outputs' are wired to the denormalized dataset.
step_1 = PrimitiveStep(
    primitive=index.get_primitive('d3m.primitives.clustering.k_means.Sloth'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='nclusters',
                          argument_type=ArgumentType.VALUE,
                          data=10)
step_1.add_hyperparameter(name='long_format',
                          argument_type=ArgumentType.VALUE,
                          data=True)
step_1.add_output('produce')
step_1.add_argument(name='outputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
pipeline_description.add_step(step_1)

# Step 2: column_parser (older DataFrameCommon variant of the primitive).
step_2 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.DataFrameCommon'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
def create_pipeline(metric: str) -> Pipeline:
    """Build the BERT pair-classification (question answering) pipeline.

    Args:
        metric: evaluation metric name forwarded to the BERT pair
            classification primitive.

    Returns:
        A TESTING-context Pipeline whose single output is the constructed
        predictions dataframe.
    """
    qa_pipeline = Pipeline(context=PipelineContext.TESTING)
    qa_pipeline.add_input(name='inputs')

    next_index = 0

    def _append(step):
        # Finalize a step: declare its 'produce' output, register it on the
        # pipeline, and hand back the data reference downstream steps use.
        nonlocal next_index
        step.add_output('produce')
        qa_pipeline.add_step(step)
        ref = 'steps.{}.produce'.format(next_index)
        next_index += 1
        return ref

    # Denormalize so that we have a single dataframe in the dataset.
    step = PrimitiveStep(
        primitive_description=DenormalizePrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    denorm_ref = _append(step)

    # Extract the dataframe from the dataset.
    step = PrimitiveStep(
        primitive_description=DatasetToDataFramePrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=denorm_ref)
    frame_ref = _append(step)

    # Parse scalar and float-vector column types.
    step = PrimitiveStep(
        primitive_description=ColumnParserPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=frame_ref)
    step.add_hyperparameter(
        'parse_semantic_types', ArgumentType.VALUE,
        ('http://schema.org/Boolean', 'http://schema.org/Integer',
         'http://schema.org/Float',
         'https://metadata.datadrivendiscovery.org/types/FloatVector'))
    parsed_ref = _append(step)

    # Extract the attribute columns.
    step = PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.
        query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=parsed_ref)
    step.add_hyperparameter(
        'semantic_types', ArgumentType.VALUE,
        ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
    attributes_ref = _append(step)

    # Extract the target columns.
    step = PrimitiveStep(
        primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.
        query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=parsed_ref)
    step.add_hyperparameter(
        'semantic_types', ArgumentType.VALUE,
        ('https://metadata.datadrivendiscovery.org/types/Target',
         'https://metadata.datadrivendiscovery.org/types/TrueTarget'))
    target_ref = _append(step)

    # BERT pair classification over document columns 1 and 3 of the
    # attributes, trained against the extracted targets.
    step = PrimitiveStep(
        primitive_description=BertPairClassificationPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=attributes_ref)
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=target_ref)
    step.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    step.add_hyperparameter('doc_col_0', ArgumentType.VALUE, 1)
    step.add_hyperparameter('doc_col_1', ArgumentType.VALUE, 3)
    predictions_ref = _append(step)

    # Convert predictions to the expected output format; 'reference' points
    # back at the parsed dataframe.
    step = PrimitiveStep(
        primitive_description=ConstructPredictionsPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                      data_reference=predictions_ref)
    step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER,
                      data_reference=parsed_ref)
    output_ref = _append(step)

    # Adding output step to the pipeline.
    qa_pipeline.add_output(name='output', data_reference=output_ref)
    return qa_pipeline