def _produce_annotations(self, inputs: Inputs) -> Outputs:
    """Generate semantic type classifications and probabilities for each column.

    Arguments:
        inputs {Inputs} -- D3M dataframe

    Returns:
        Outputs -- dataframe with two columns: "semantic types" and
            "probabilities". Each row represents a column of the original
            dataframe; "semantic types" holds the tuple of semantic type
            labels assigned to that column and "probabilities" holds the
            model's confidence in each respective label.
    """
    # Locate the pretrained model checkpoints shipped in the static volume.
    checkpoint_dir = (
        self._volumes["simon_models_1"] + "/simon_models_1/pretrained_models/")
    # NOTE(review): the pairing looks inverted relative to the hyperparam name
    # (statistical_classification=True -> "Base.pkl"); this preserves the
    # original behavior -- confirm against the shipped model files.
    if self.hyperparams["statistical_classification"]:
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(
            self._volumes["simon_models_1"] + "/simon_models_1" + category_list,
            "r") as f:
        Categories = f.read().splitlines()

    # Create the model object and restore the pretrained weights.
    Classifier = Simon(encoder={})
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]
    model = Classifier.generate_model(
        20, self.hyperparams["max_rows"], len(Categories))
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["binary_accuracy"])

    # Encode the input dataframe and predict per-column semantic types.
    frame = inputs.copy()
    prepped_data = encoder.encodeDataFrame(frame)
    preds = model.predict_on_batch(tf.constant(prepped_data))
    logger.debug('------------Reverse label encoding------------')
    # decoded_preds is a pair: (label tuples per column, probability lists
    # per column) -- index [0] holds labels, index [1] holds probabilities.
    decoded_preds = encoder.reverse_label_encode(
        preds, self.hyperparams["p_threshold"])

    # apply statistical / ordinal classification if desired
    if self.hyperparams["statistical_classification"]:
        logger.debug(
            "Beginning Guessing categorical/ordinal classifications...")
        raw_data = frame.values
        guesses = [
            guess(raw_data[:, i], for_types="category")
            for i in np.arange(raw_data.shape[1])
        ]
        # probability of rule-based statistical / ordinal classifications
        # = min probability of existing classifications
        for i, g in enumerate(guesses):
            if g[0] == "category":
                if len(decoded_preds[1][i]) == 0:
                    guess_prob = self.hyperparams['p_threshold']
                else:
                    guess_prob = min(decoded_preds[1][i])
                decoded_preds[0][i] += ("categorical", )
                decoded_preds[1][i].append(guess_prob)
                # BUG FIX: the label names live in decoded_preds[0] (the
                # semantic-type tuples); the original tested the probability
                # list decoded_preds[1], which only ever contains floats, so
                # the "ordinal" label was never assigned.
                if (("int" in decoded_preds[0][i])
                        or ("float" in decoded_preds[0][i])
                        or ("datetime" in decoded_preds[0][i])):
                    decoded_preds[0][i] += ("ordinal", )
                    decoded_preds[1][i].append(guess_prob)
        logger.debug("Done with statistical variable guessing")

    # clear tf session, remove scratch file written during encoding
    Classifier.clear_session()
    try:
        os.remove('unencoded_chars.json')
    except FileNotFoundError:
        # Best-effort cleanup: the encoder may not have written the file.
        pass

    out_df = pd.DataFrame.from_records(list(decoded_preds)).T
    out_df.columns = ["semantic types", "probabilities"]
    return out_df