def visit(self, element):
    """Condense the element's dataframe by grouping on self._column.

    Feature-vector columns (cells holding list/tuple/ndarray) are expanded,
    aggregated per group with the configured numeric op, and re-packed into
    single cells; plain columns are aggregated directly. The original column
    order is restored before the result is written back to *element*.
    """
    try:
        featureset_df = element.get_dataframe()
        # collect columns to restore initial column order in the end
        columns = list(featureset_df)
        # the grouping key(s); keep as a DataFrame so pd.concat aligns on axis 1
        index_df = featureset_df[self._column] if isinstance(self._column, list) \
            else featureset_df[[self._column]]
        aggregated_groups = []
        for column in featureset_df:
            if featureset_df[column].dtype == "object":
                # only cells that hold sequences are treated as feature vectors
                if isinstance(featureset_df[column].iloc[0], (list, tuple, np.ndarray)):
                    group_df = self.expand(featureset_df, column, True)
                    group_idx = pd.concat([index_df, group_df], axis=1)
                    group_idx = group_idx.groupby(self._column).aggregate(
                        self.select_numeric_op())
                    # re-pack the aggregated expansion into one cell per row
                    group_idx[column] = list(group_idx.values)
                    aggregated_groups.append(group_idx[[column]])
        # aggregate the remaining (scalar) columns in one pass
        featureset_df = featureset_df.groupby(self._column).aggregate(
            self.select_numeric_op())
        aggregated_groups.append(featureset_df)
        featureset_df = pd.concat(aggregated_groups, axis=1)
        featureset_df = featureset_df.reset_index()
        # restore the caller's column order
        featureset_df = featureset_df.reindex(columns, axis=1)
        element.set_dataframe(featureset_df)
    except Exception as error:
        Util.print_error("Unable to condense Dataframe: " + str(error))
        Util.print_detailed_error()
def execute(self, element):
    """Classify the dtype of column self._column of the element's dataframe.

    Returns one of "string", "stringarray", "intarray", "floatarray",
    "featurevector_<dtype>" for object columns (inspecting up to the first
    1000 rows, skipping empty sequences), or str(dtype) for plain columns.
    Returns None if an object column yields no classifiable cell.
    """
    try:
        data_type = element.get_dataframe()[self._column].dtype
        if data_type == "object":
            data = element.get_dataframe()[self._column]
            # Bounded probe: the old loop ran to 1000 with a no-op `continue`
            # past the end of the data; cap the range instead.
            for x in range(min(len(data), 1000)):
                value = data.iloc[x]
                if isinstance(value, (list, tuple, np.ndarray)):
                    if len(value) == 0:
                        continue  # empty cell tells us nothing; try the next row
                    elif isinstance(value[0], str):
                        return "stringarray"
                    elif isinstance(value[0], int):
                        return "intarray"
                    elif isinstance(value[0], float):
                        return "floatarray"
                    else:
                        # assume a numpy scalar with a .dtype attribute
                        return "featurevector_" + str(value[0].dtype)
                elif isinstance(value, str):
                    return "string"
        elif isinstance(data_type, str):
            return "string"
        else:
            return str(data_type)
    except Exception as error:
        util.print_error("Unable to print data type: " + str(error))
        util.print_detailed_error()
def visit(self, element):
    """Fill missing values in column self._column of the element's dataframe.

    String columns are filled with self._value; numeric columns are delegated
    to self.fill_number_feature_cells. Feature-vector columns (cells holding
    list/tuple/ndarray) are expanded first and filled per sub-column.
    """
    try:
        featureset_df = element.get_dataframe()
        # check if column is single-column or column group
        feature_vector = False
        target_column = featureset_df[self._column]
        if target_column.dtype == "object":
            if isinstance(target_column.iloc[0], (list, tuple, np.ndarray)):
                feature_vector = True
                # expanded view: one sub-column per vector component
                target_column = self.expand(featureset_df, self._column, True)
        # TODO no option for feature vector with string features yet (compare numbers)
        if self._type == "string":
            # NOTE(review): inplace fillna on a column selection may not write
            # through to featureset_df on newer pandas — verify propagation.
            target_column.fillna(self._value, inplace=True)
        if self._type == "number":
            if feature_vector:
                for column in target_column:
                    # fill each expanded component column in place
                    self.fill_number_feature_cells(target_column[column])
                # re-pack the filled components into one cell per row
                featureset_df[self._column] = list(
                    target_column[list(target_column)].values)
            else:
                self.fill_number_feature_cells(target_column)
        element.set_dataframe(featureset_df)
    except Exception as error:
        Util.print_error("Unable to Group Features: " + str(error))
        Util.print_detailed_error()
def visit(self, model):
    """Add the accuracy score of self._predict vs. self._true to the
    model's metric dict under the key 'AccScore'."""
    try:
        result = model.get_metric()
        result['AccScore'] = accuracy_score(self._true, self._predict)
        model.set_metric(result)
    except Exception as error:
        # Fixed copy-pasted message: this visitor sets a metric, not the estimator.
        Util.print_error("Unable to set accuracy score of Model")
        Util.print_error(error)
def visit(self, model):
    """Run the model's estimator on the stored samples and return the predictions."""
    try:
        return model.get_estimator().predict(self._predict)
    except Exception as error:
        Util.print_error("Unable to predict")
        Util.print_error(error)
def visit(self, model):
    """Train the model's estimator on the stored features/targets and store it back."""
    try:
        trained = model.get_estimator()
        trained.fit(self._X, self._Y)
        model.set_estimator(trained)
    except Exception as error:
        Util.print_error("Unable to fit estimator")
        Util.print_error(error)
def visit(self, featureset):
    """Insert self._value between the characters of every string in the column."""
    try:
        joined = np.char.join(self._value,
                              featureset.get_column_values(self._column))
        featureset.set_dataframe_column(self._column, joined)
    except Exception as error:
        util.print_error("Unable to add value to array")
        util.print_error(error)
def visit(self, model):
    """Return the k-fold cross-validation results for the configured estimator
    on the model's training data."""
    try:
        x_train = model.get_x_train()
        y_train = model.get_y_train()
        return cross_validate(self._estimator, x_train, y_train,
                              cv=self._k_fold)
    except Exception as error:
        Util.print_error("Unable to calculate cross validation")
        Util.print_error(error)
def visit(self, model):
    """Add the precision score of self._predict vs. self._true (with the
    configured averaging mode) to the model's metric dict under 'PrecScore'."""
    try:
        result = model.get_metric()
        result['PrecScore'] = precision_score(self._true, self._predict,
                                              average=self._average)
        model.set_metric(result)
    except Exception as error:
        # Fixed copy-pasted message: this visitor sets a metric, not the estimator.
        Util.print_error("Unable to set precision score of Model")
        Util.print_error(error)
def visit(self, model):
    """Resolve the configured estimator; either return it directly
    (self._return_value) or attach it and its learning type to *model*."""
    try:
        resolved = self.get_estimator(self._estimator)
        if self._return_value:
            return resolved
        model.set_estimator(resolved)
        model.set_estimator_type(self._learning_type)
    except Exception as error:
        Util.print_error("Unable to set estimator of Model")
        Util.print_error(error)
def visit(self, featureset):
    """Impute missing values: column medians for method 'median', otherwise
    pandas interpolation with the configured method."""
    try:
        data = featureset.get_featureset()
        if self._method == "median":
            filled = data.fillna(data.median())
        else:
            filled = data.interpolate(method=self._method)
        featureset.set_featureset(filled)
    except Exception as error:
        Util.print_error("Unable to mask featureset: " + str(error))
        Util.print_detailed_error()
def visit(self, element):
    """Reorder the element's dataframe: 'shuffle' samples all rows randomly,
    'column' sorts by self._column, 'index' sorts by the index. Any other
    mode leaves the frame untouched."""
    try:
        frame = element.get_dataframe()
        mode = self._mode
        if mode == "shuffle":
            ordered = frame.sample(frac=1)
        elif mode == "column":
            ordered = frame.sort_values(by=self._column)
        elif mode == "index":
            ordered = frame.sort_index()
        else:
            ordered = frame
        element.set_dataframe(ordered)
    except Exception as error:
        Util.print_error("Unable to sort featureset: " + str(error))
        Util.print_detailed_error()
def execute(self, element):
    """Binarize the element's dataframe labels one-vs-all.

    Stores the learned classes on the element and replaces its dataframe
    with the binarized indicator matrix (original row index preserved).
    """
    try:
        processor = LabelBinarizer(self._neg_label, self._pos_label,
                                   self._sparse_output)
        dataframe = element.get_dataframe()
        data = processor.fit_transform(dataframe.values)
        element.set_classes(processor.classes_)
        # Bug fix: `dataframe.values = data` always raised AttributeError
        # (DataFrame.values is a read-only property), so the binarized data
        # was silently dropped. Build a fresh frame instead.
        element.set_dataframe(pd.DataFrame(data, index=dataframe.index))
    except Exception as error:
        Util.print_error("Unable to label binarize Dataframe: " + str(error))
        Util.print_detailed_error()
def visit(self, featureset):
    """Upper-case every string in the column; rows that hold a list of words
    are upper-cased word by word."""
    try:
        result = []
        for text in featureset.get_column_values(self._column):
            if isinstance(text, list):
                result.append([word.upper() for word in text])
            else:
                result.append(text.upper())
        featureset.set_dataframe_column(self._column, result)
    except Exception as error:
        # Fixed copy-pasted message (was "Unable to add value to array").
        util.print_error("Unable to uppercase column")
        util.print_error(error)
def visit(self, featureset):
    """Replace the column with the character count of each entry as an (n, 1)
    numpy column: len(text) for a string, total length of all words for a
    list-of-words row."""
    try:
        result = []
        for text in featureset.get_column_values(self._column):
            if isinstance(text, list):
                # Bug fix: the old code appended the per-word length *list*
                # instead of its sum, which broke the (n, 1) column shaping
                # below for token-list rows.
                count = sum(len(word) for word in text)
            else:
                # sum of per-character lengths == len(text)
                count = len(text)
            result.append(count)
        featureset.set_dataframe_column(
            self._column, np.asarray(result)[:, np.newaxis])
    except Exception as error:
        util.print_error("Unable to create character sum of text")
        util.print_error(error)
def visit(self, featureset):
    """Map the column to truthiness flags (1/0) wrapped as an (n, 1) array;
    a row holding a list of words gets a per-word flag list."""
    try:
        flags = []
        for entry in featureset.get_column_values(self._column):
            if isinstance(entry, list):
                flags.append([1 if token else 0 for token in entry])
            else:
                flags.append(1 if entry else 0)
        featureset.set_dataframe_column(
            self._column, np.asarray(flags)[:, np.newaxis])
    except Exception as error:
        util.print_error("Unable to create binary column")
        util.print_error(error)
def visit(self, featureset):
    """Replace each column entry with its n-grams via self.n_gram; rows that
    hold a list of words are transformed word by word."""
    try:
        result = []
        for text in featureset.get_column_values(self._column):
            if isinstance(text, list):
                result.append([self.n_gram(word) for word in text])
            else:
                # removed leftover debug print of the n-gram result
                result.append(self.n_gram(text))
        featureset.set_dataframe_column(self._column, result)
    except Exception as error:
        util.print_error("Unable to Create Word NGrams")
        util.print_error(error)
def visit(self, featureset):
    """Apply self.remove to every column entry (word-wise for list rows) and
    write the cleaned values back in the column's original array shape."""
    try:
        cleaned = []
        for text in featureset.get_column_values(self._column):
            if isinstance(text, list):
                cleaned.append([self.remove(word) for word in text])
            else:
                cleaned.append(self.remove(text))
        reshaped = np.asarray(cleaned)[:, np.newaxis].reshape(
            featureset.get_column_values(self._column).shape)
        featureset.set_dataframe_column(self._column, reshaped)
    except Exception as error:
        util.print_error("Unable to tokenize column")
        util.print_error(error)
def visit(self, element):
    """Split the element's dataframe into multiple frames keyed by the names
    in self._id_split (sizes computed via self.create_split_list).

    'sequential' takes a leading slice per key; 'random' hands the frame to
    np.split. Returns a dict of key -> frame(s).
    """
    try:
        featuresets = {}
        data = element.get_dataframe()
        temp_data = []
        # populate self._id_split based on the total row count
        self.create_split_list(data.shape[0])
        for key, value in self._id_split.items():
            if self._mode == "sequential":
                temp_data = data[:value]
            elif self._mode == "random":
                # NOTE(review): np.split returns a *list* of sub-frames here,
                # unlike the single frame of the sequential branch — confirm
                # callers handle both shapes.
                temp_data = np.split(data, value)
            # consume the taken rows so the next key starts after them
            # (assumed to apply to both modes — TODO confirm against original
            # indentation; the source was whitespace-mangled)
            data = data.iloc[value:]
            featuresets[key] = temp_data
        return featuresets
    except Exception as error:
        Util.print_error(
            "Unable to split Featureset in multiple Frames: " + str(error))
        Util.print_detailed_error()
def visit(self, element):
    """Mask (NaN-out) cells of the element's dataframe that match the
    configured condition, either frame-wide (self._column is None) or on a
    single column; feature-vector columns are expanded before masking."""
    try:
        featureset = element.get_dataframe()
        # NOTE(review): eval() executes arbitrary code from self._condition —
        # acceptable only if conditions come from trusted pipeline configs,
        # never from untrusted user input.
        if self._column is None:
            featureset = featureset.mask(eval(self._condition))
        else:
            feature = featureset[self._column]
            if feature.dtype == "object":
                # cells holding sequences are expanded to sub-columns first
                if isinstance(feature.iloc[0], (list, tuple, np.ndarray)):
                    feature = self.expand(pd.DataFrame(feature), self._column)
                    feature = feature.mask(eval(self._condition))
                    # re-pack the masked expansion into one cell per row
                    feature = list(feature.values)
            else:
                feature = feature.mask(eval(self._condition))
            featureset[self._column] = feature
        element.set_dataframe(featureset)
    except Exception as error:
        Util.print_error("Unable to mask featureset: " + str(error))
        Util.print_detailed_error()
def visit(self, element):
    """Condense the element's dataframe by collapsing runs (sequential mode)
    or all rows sharing a value (unique mode) of self._column into single
    rows aggregated via self.select_numeric_feature.

    NOTE(review): uses DataFrame.append and Series.iteritems, both removed
    in pandas 2.0 — needs migration to pd.concat / .items.
    """
    try:
        featureset_df = element.get_dataframe()
        new_featureset = pd.DataFrame()
        if self._sequential is True:
            # collapse consecutive runs of equal self._column values
            temp_dataframe = pd.DataFrame()
            temp_value = None
            temp_index = []
            for index, row in featureset_df.iterrows():
                "Set first indexlist element"
                if temp_value is None:
                    temp_value = row[self._column]
                    temp_index.append(index)
                else:
                    "Check if value of column is in current row and add the row to new dataframe"
                    if temp_value != row[self._column]:
                        "Save the value from the selected column"
                        first_value_frame = pd.DataFrame(
                            {self._column: [temp_value]})
                        "Calculate the value for every column"
                        second_value_frame = self.select_numeric_feature(
                            temp_dataframe)
                        "Transform Series Dataframe to Dataframe and transpose it"
                        second_value_frame = second_value_frame.to_frame(
                        ).transpose()
                        "Add every column to dataframe"
                        for name, value in second_value_frame.iteritems():
                            first_value_frame[name] = value
                        "Add new row to dataframe"
                        new_featureset = new_featureset.append(
                            first_value_frame, ignore_index=True)
                        temp_dataframe = pd.DataFrame()
                        "Add index to indexlist"
                        temp_index.append(index)
                        temp_value = row[self._column]
                "Add Row to temporary dataframe"
                # NOTE(review): .iloc[index] assumes a default RangeIndex
                # (positional == label) — TODO confirm for callers.
                temp_dataframe = temp_dataframe.append(
                    featureset_df.iloc[index])
            # flush the final run after the loop
            "Save the value from the selected column"
            first_value_frame = pd.DataFrame({self._column: [temp_value]})
            "Calculate the value for every column"
            second_value_frame = self.select_numeric_feature(
                temp_dataframe)
            "Transform Series Dataframe to Dataframe and transpose it"
            second_value_frame = second_value_frame.to_frame().transpose()
            "Add every column to dataframe for last row"
            for name, value in second_value_frame.iteritems():
                first_value_frame[name] = value
            new_featureset = new_featureset.append(first_value_frame,
                                                   ignore_index=True)
        else:
            # collapse all rows sharing each unique self._column value
            accumulate_list = featureset_df[self._column].unique()
            temp_unique_list = accumulate_list
            temp_index = []
            "Create Index list for unique values"
            # record the first row index at which each unique value appears
            for index, row in featureset_df.iterrows():
                for value in temp_unique_list:
                    if value == row[self._column]:
                        temp_unique_list = temp_unique_list[
                            temp_unique_list != value]
                        temp_index.append(index)
                        break
            for value in accumulate_list:
                "Select all rows with value in column"
                temp_dataframe = featureset_df.loc[featureset_df[
                    self._column] == value]
                "Save the value from the selected column"
                first_value_frame = pd.DataFrame({self._column: [value]})
                "Calculate the value for every column"
                second_value_frame = self.select_numeric_feature(
                    temp_dataframe)
                "Transform Series Dataframe to Dataframe and transpose it"
                second_value_frame = second_value_frame.to_frame(
                ).transpose()
                "Add every column to dataframe"
                for name, second_value in second_value_frame.iteritems():
                    first_value_frame[name] = second_value
                new_featureset = new_featureset.append(first_value_frame,
                                                       ignore_index=True)
        # Update Dataframe index
        if self._save_index:
            # restore the recorded original row indices
            new_featureset["#index#"] = temp_index
            new_featureset = new_featureset.set_index("#index#")
            new_featureset.index.name = None
        "Reindexcolumns of the new dataframe with the old dataframe"
        new_featureset = new_featureset.reindex(
            columns=featureset_df.columns)
        element.set_dataframe(new_featureset)
    except Exception as error:
        Util.print_error("Unable to condense Dataframe: " + str(error))
        Util.print_detailed_error()
def visit(self, featureset):
    """Extract text features from column self._column of the featureset.

    The tokenizer is chosen from self._extraction_target (word / pos /
    ne_simple / ne_detailed / wordlist_<path>), the vectorizer from
    self._measure and self._ngram. Returns the new column: a bag-of-words /
    n-gram matrix ('bow'/'ngram'), token lists ('list'), 0/1 presence flags
    ('presence') or token counts ('count').
    """
    try:
        # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then)
        tokenizer = None
        if self._extraction_target == "word":
            tokenizer = LemmaTokenizer(LanguageProcessor())
        elif self._extraction_target == "pos":
            tokenizer = POSTokenizer(LanguageProcessor())
        elif self._extraction_target == "ne_simple":
            tokenizer = NamedEntityTokenizer(LanguageProcessor())
        elif self._extraction_target == "ne_detailed":
            tokenizer = NamedEntityTokenizer(LanguageProcessor(),
                                             detailed=True)
        elif self._extraction_target.startswith("wordlist"):
            path = self._extraction_target.split("_")[1]
            tokenizer = WordlistEntryTokenizer(LanguageProcessor(),
                                               wordlist=path)
        # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then)
        # (removed leftover debug prints of self._ngram / self._column)
        vectorizer = None
        binary = self._measure == "presence" or self._extraction_type == "presence"
        if self._ngram is None:
            if self._measure == "tfidf":
                vectorizer = TfidfVectorizer(tokenizer=tokenizer)
            else:
                # TODO: here it is absolute term-frequency - what about relative?
                # For ngrams not easy:
                # - needs to count the amount of n-gram for each document and divide each feature generated from
                #   the ngram-counts of the document by that amount
                # For named-entities:
                # - count words inside named entities (not just the amount of NEs) devide by num tokens of doc
                # ...
                vectorizer = CountVectorizer(tokenizer=tokenizer,
                                             binary=binary)
        else:
            if self._measure == "tfidf":
                vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                             ngram_range=self._ngram)
            else:
                vectorizer = CountVectorizer(tokenizer=tokenizer,
                                             ngram_range=self._ngram,
                                             binary=binary)
        temp_column = featureset.get_featureset()[self._column]
        temp_column = temp_column.values
        new_column = []
        # Note: Presence and Count for every(einzeln) feature or for all(alle) feature
        if self._extraction_type == "bow" or self._extraction_type == "ngram":
            # Return Matrix
            new_column = list(vectorizer.fit_transform(temp_column).toarray())
        elif self._extraction_type == "list":
            # Return String Array (removed leftover per-row debug prints)
            analyzer = vectorizer.build_tokenizer()
            for row in temp_column:
                new_column.append(analyzer(row))
        elif self._extraction_type == "presence":
            # Return Numeric Array
            analyzer = vectorizer.build_tokenizer()
            for row in temp_column:
                new_column.append(1 if len(analyzer(row)) > 0 else 0)
        elif self._extraction_type == "count":
            # Return Numeric Array
            analyzer = vectorizer.build_tokenizer()
            for row in temp_column:
                new_column.append(len(analyzer(row)))
        return new_column
    except Exception as error:
        util.print_error("Failed to use Language Processor " + str(error))
        util.print_detailed_error()