def preprocess(words_file="../tools/word_data.pkl",
               authors_file="../tools/email_authors.pkl"):
    """Load pickled email texts and author labels and build train/test features.

    The function performs these steps, in order:

    -- unpickles the author labels and the email texts
    -- splits them into training/testing sets (10% testing, fixed seed)
    -- vectorizes the texts into a tf-idf matrix (fitted on training data only)
    -- keeps a percentile of the features, chosen on the training data
    -- converts the selected feature matrices to dense arrays via ``toarray()``

    Some summary information about the training set is printed to stdout.

    Args:
        words_file: path to the pickle holding the email texts
            (by default word_data.pkl).
        authors_file: path to the pickle holding the corresponding author
            labels (by default email_authors.pkl).

    Returns:
        A 4-tuple ``(features_train, features_test, labels_train,
        labels_test)``. The features are the dense arrays produced by the
        selector; the labels are returned exactly as produced by
        ``train_test_split``.
    """
    # The words (features) and authors (labels) are already largely
    # preprocessed when they are pickled.
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point these paths at files from a trusted source.
    # ``with`` guarantees each handle is closed even if unpickling fails.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    # test_size is the fraction of events assigned to the test set
    # (the remainder go into training).
    features_train, features_test, labels_train, labels_test = (
        model_selection.train_test_split(
            word_data, authors, test_size=0.1, random_state=42))

    # Text vectorization -- go from strings to lists of numbers.
    # The vectorizer is fitted on the training texts only, so the test set
    # does not influence the vocabulary or the idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, because text is very high dimensional and can be
    # computationally expensive as a result. The selector is fitted on the
    # training data and then applied to both sets.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    # Info on the data. The labels are summed to count the "Chris" emails,
    # so presumably Chris is encoded as 1 and Sara as 0 -- confirm against
    # the code that produced the pickle.
    chris_count = sum(labels_train)
    print("no. of Chris training emails:", chris_count)
    print("no. of Sara training emails:", len(labels_train) - chris_count)
    print("selector percentile (percent of features used):",
          selector.get_params()['percentile'])

    return (features_train_transformed, features_test_transformed,
            labels_train, labels_test)
class SelectPercentile(FeatureSelectionAlgorithm):
    r"""Feature selection that keeps a percentile of the best-scoring features.

    The work is delegated to an internal ``SelectPerc`` estimator
    (presumably scikit-learn's ``SelectPercentile``, per the documentation
    link below -- confirm against the file's imports).

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select Percentile'

    def __init__(self, **kwargs):
        r"""Set up the parameter definitions and the wrapped selector.

        Keyword arguments are accepted but not used here.
        """
        # Candidate scoring functions for the selector.
        score_func_definition = ParameterDefinition(
            [chi2, f_classif, mutual_info_classif])
        # Percentile is described by a MinMax(10, 100) range, typed np.uint.
        percentile_definition = ParameterDefinition(MinMax(10, 100), np.uint)

        self._params = {
            'score_func': score_func_definition,
            'percentile': percentile_definition,
        }
        self.__select_percentile = SelectPerc()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the wrapped selector.

        Arguments:
            **kwargs: Parameters passed on to the selector's ``set_params``.
        """
        selector = self.__select_percentile
        selector.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Fit the wrapped selector and report which features it keeps.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series) Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        selector = self.__select_percentile
        selector.fit(x, y)
        return selector.get_support()

    def to_string(self):
        r"""Build a user friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        # The base-class string is a template with ``name`` and ``args``
        # placeholders that are filled in here.
        template = FeatureSelectionAlgorithm.to_string(self)
        display_name = self.Name
        arguments = self._parameters_to_string(
            self.__select_percentile.get_params())
        return template.format(name=display_name, args=arguments)