def test_tifd_vector(): data = main.getDataSet()#['My name is Rohan', 'My name is Sachin', 'rohan'] t = TifdVector(); print t.getFeatureVector_fromset(data)
vectorizer = make_pipeline(hasher, TfidfTransformer()) else: vectorizer = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=False, norm='l2', binary=False) else: vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, min_df=2, stop_words='english', use_idf=opts.use_idf) import main tweets = main.getDataSet() X = vectorizer.fit_transform(tweets) # X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() if opts.n_components: print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(opts.n_components) lsa = make_pipeline(svd, Normalizer(copy=False))
hasher = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=True, norm=None, binary=False) vectorizer = make_pipeline(hasher, TfidfTransformer()) else: vectorizer = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=False, norm='l2', binary=False) else: vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, min_df=2, stop_words='english', use_idf=opts.use_idf) import main tweets = main.getDataSet() X = vectorizer.fit_transform(tweets) # X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() if opts.n_components: print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(opts.n_components) lsa = make_pipeline(svd, Normalizer(copy=False))
stop_words='english', non_negative=True, norm=None, binary=False) vectorizer = make_pipeline(hasher, TfidfTransformer()) else: vectorizer = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=False, norm='l2', binary=False) else: vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, min_df=2, stop_words='english', use_idf=opts.use_idf) import main X = vectorizer.fit_transform(main.getDataSet()) # X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() if opts.n_components: print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(opts.n_components) lsa = make_pipeline(svd, Normalizer(copy=False))