def main():
    """Demo of sklearn feature-extraction vectorizers on toy dicts and loaded text.

    Prints intermediate arrays/vocabularies at each stage; returns nothing.
    Relies on module-level imports of DictVectorizer, FeatureHasher,
    CountVectorizer, TfidfTransformer, HashingVectorizer and the project's
    Datasets helper.
    """
    datasets = [
        {"city": "beijing", "age": 500, "temperature": 26},
        {"city": "shanghai", "age": 550, "temperature": 27},
        {"city": "shenzheng", "age": 300, "temperature": 30},
    ]

    # One-hot encode the categorical "city" field; numeric fields pass through.
    dict_vectorizer = DictVectorizer()
    dv_datasets = dict_vectorizer.fit_transform(datasets)
    print(dv_datasets.toarray())
    print(dict_vectorizer.vocabulary_)
    print(dict_vectorizer.feature_names_)
    print("-" * 80)

    # Hash raw strings into a fixed number of feature columns (stateless,
    # no vocabulary is learned).
    fh_vectorizer = FeatureHasher(n_features=10, input_type="string")
    fh_datasets = fh_vectorizer.fit_transform(
        ["Liming love football", "Zhansan likes baseball"])
    print(fh_datasets.toarray())

    # Load project text data and keep only the first 10 documents for the demo.
    raw_datasets, _ = Datasets.load_datasets()
    datasets = list(raw_datasets.data[:10])

    # Bag-of-words counts, then TF-IDF weighting on top of those counts.
    count_vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = count_vectorizer.fit_transform(datasets)
    print(count_vectorizer.vocabulary_)

    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    tfidft_datasets = tfidf_transformer.fit_transform(cv_datasets)
    print(tfidft_datasets.toarray())
    print(tfidf_transformer.idf_)

    # Hashing alternative to CountVectorizer: fixed-width output, no vocabulary.
    hash_vectorizer = HashingVectorizer(n_features=100, decode_error="ignore")
    hv_datasets = hash_vectorizer.fit_transform(datasets)
    print(hv_datasets.toarray().shape)
def main():
    """Feature-selection demo: tree-based importances, then binarize + variance filter.

    NOTE(review): this redefines ``main`` declared earlier in the file, so only
    this definition is reachable at call time — consider renaming one of them.

    Prints array shapes and feature importances at each stage; returns nothing.
    Relies on module-level imports of Datasets, gen_datasets, CountVectorizer,
    ExtraTreesClassifier, SelectFromModel, Binarizer, VarianceThreshold.
    """
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)

    # Dense bag-of-words matrix for the classifiers below.
    vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = vectorizer.fit_transform(X).toarray()

    # Rank features by importance with an extra-trees ensemble...
    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print(cv_datasets.shape)
    print(clf.feature_importances_)

    # ...then keep only features whose importance clears the default threshold.
    model = SelectFromModel(clf, prefit=True)  # fixed typo: was "modle"
    X_new = model.transform(cv_datasets)
    print(X_new.shape)

    # Binarize counts to presence/absence, then drop near-constant features:
    # threshold .8*(1-.8) removes binary features that are the same value in
    # more than 80% of samples.
    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print(v_datasets.shape)