Example #1
def _word_tfidf_dist(documents):
    words_tfidf = {}
    
    if len(documents) > 0: 
        if _check_is_sentence(documents):  # documents made of 1-3 character tokens (acronyms) are handled by _freqdist below
            try:
                text_analyzer = Vectorizer(ngram_range=(1,2),max_features=50)
                matrix = text_analyzer.fit_transform(documents).todense()
                
                for word, idx in text_analyzer.vocabulary_.items():
                    words_tfidf[word] = score_tfidf_freq(matrix[:, idx])
            except ValueError:
                return {}
        else:
            return _freqdist(documents)
    return words_tfidf
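A self-contained sketch of the same idea, assuming a current scikit-learn where TfidfVectorizer plays the role of the old Vectorizer; the column mean below is only a stand-in for score_tfidf_freq, which is not shown above:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["acquisition of office goods", "supply of office goods", "office chairs"]

text_analyzer = TfidfVectorizer(ngram_range=(1, 2), max_features=50)
matrix = text_analyzer.fit_transform(docs).toarray()

# One aggregate TF-IDF score per vocabulary term (column mean as a stand-in
# for score_tfidf_freq).
words_tfidf = {word: matrix[:, idx].mean()
               for word, idx in text_analyzer.vocabulary_.items()}
print(sorted(words_tfidf.items(), key=lambda kv: -kv[1])[:5])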
Example #2
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix. Weights can be
    based on IDF if requested.

    Arguments
    ---------
    fpath: a path to a file
        Each line is a video; tags are separated by spaces
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF.

    The file is cleaned with bottom_filter=0, so no minimum tag probability
    is enforced.
    '''
    #Vectorizes to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf = use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
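clean_up and NoopAnalyzer are not shown above; the analyzer presumably passes the already-tokenized tag lines straight through. A hedged sketch of that idea with current scikit-learn, where the analyzer can be any callable (str.split stands in for NoopAnalyzer, and the tag lines are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy stand-in for the cleaned tag file: one video per line, tags separated by spaces.
tag_lines = ["rock guitar live", "guitar lesson beginner", "live concert rock"]

vectorizer = TfidfVectorizer(analyzer=str.split, use_idf=False)
sparse_matrix = vectorizer.fit_transform(tag_lines)
vocabulary = vectorizer.vocabulary_
print(sparse_matrix.shape, sorted(vocabulary))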
Example #4
def get_20newsgroups_data_info_for_categories(categories):
    data = fetch_20newsgroups(subset='all', categories=categories, shuffle=False)
    vectorizer = Vectorizer()
    t0 = time()
    
    tfidf = vectorizer.fit_transform(data.data)
    
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    print "done in %fs" % (time() - t0)
    
    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]
    
    # Similarity ranges from zero to one, so (1 - s) gives a distance from 0 to 1.
    distances = [[1 - s for s in row[:col_to + 1]]
                 for (col_to, row) in enumerate(pairwise_similarity)]

    # Fix the slight numerical error from the precision conversion: each
    # document's distance to itself should be exactly zero.
    for row in distances:
        row[-1] = 0 
    
    pcd_tuples = zip(payloads, labels, distances)
    
    di = DataInfo.deserialize_pcd_tuples(pcd_tuples)
    return di
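A minimal, self-contained sketch of the similarity-to-distance step above, assuming current scikit-learn (TfidfVectorizer in place of the old Vectorizer) and toy documents in place of the 20 newsgroups posts:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["space shuttle launch", "hockey playoff game", "shuttle mission crew"]

tfidf = TfidfVectorizer().fit_transform(docs)
# Rows are L2-normalised, so the dot products are cosine similarities in [0, 1].
similarity = (tfidf @ tfidf.T).toarray()
# 1 - s turns a similarity into a distance; the diagonal should be ~0.
distance = 1.0 - similarity
print(distance.round(3))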
Example #5
documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
mbkm.fit(X)
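The k and chunk_size arguments above belong to the old MiniBatchKMeans signature; in current scikit-learn they are n_clusters and batch_size. A hedged sketch of the same pipeline with the current names, using toy documents in place of the newsgroups data:

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

docs = ["graphics card driver update", "ice hockey final score",
        "video card benchmark results", "hockey league standings"]

X = TfidfVectorizer(max_features=10000).fit_transform(docs)
X = Normalizer(norm="l2", copy=False).transform(X)

mbkm = MiniBatchKMeans(n_clusters=2, init="random", max_iter=10,
                       random_state=13, batch_size=1000, n_init=3)
mbkm.fit(X)
print(mbkm.labels_)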
Example #6
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer's DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
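The block that tests "the direct tfidf vectorizer" relies on Vectorizer being equivalent to CountVectorizer followed by TfidfTransformer; the same equivalence holds for TfidfVectorizer in current scikit-learn. A minimal sketch with toy documents in place of ALL_FOOD_DOCS:

import numpy as np
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                              TfidfVectorizer)

docs = ["pizza pizza salad", "salad tomato water", "coke burger beer pizza"]

counts = CountVectorizer().fit_transform(docs)
two_step = TfidfTransformer(norm='l1').fit_transform(counts).toarray()
direct = TfidfVectorizer(norm='l1').fit_transform(docs).toarray()

# Count + transform and the direct vectorizer agree up to floating point.
np.testing.assert_array_almost_equal(two_step, direct)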
Example #7
print 'data loaded'

categories = data_train.target_names  # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
Example #9
filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000, tol=0.0, n_init=1)
Example #11
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    
    #We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    #Get invalid lines
    invalids = invalid(data)
    
    #Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    #Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)
    
    #Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init = 'random')
    mbkm.fit(doc_sparse_matrix)
    
    #New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_
    
    #Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']
    
    #And a new column: Superfaturamento (overbilling)
    super_faturamento = np.ndarray(shape=len(labels_column), dtype='S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50: #up to 50% over the reference price is accepted
            super_faturamento[i] = 'OK'
        elif ganho > -500: #beyond that it is considered overbilling
            super_faturamento[i] = 'Super'
        else: #more than 5x the reference value
            super_faturamento[i] = 'SuperPlus'
    
    for i in xrange(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
Example #12

input_data = csv.reader(open('descriptions_100.csv','rb'))
dataset_data = []
dataset_target = []
for row in input_data:
    dataset_data.append(row[1])
    dataset_target.append(row[0])

labels = dataset_target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset_data)
print X

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape


###############################################################################
# Do the actual clustering

km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1, init_size=1000,
                     batch_size=1000, verbose=1)

print "Clustering with %s" % km
t0 = time()
km.fit(X)
print "done in %0.3fs\n" % (time() - t0)
Example #13
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']
    
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    
    #Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    #Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)
    
    #Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in xrange(n_runs):
        for k in k_vals:
            #Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))
            
            #Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)
            
            centers = mbkm.cluster_centers_
            labels = mbkm.labels_
            
            #Inter-cluster distance. We use the minimum because the idea is to
            #maximize this; the min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            #Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)
            
            #Prints num elements per cluster
            print('Run %d ; k = %d' %(i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' %(cluster, population))
            print()
    
    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))
    
    #hack so the zero appears on the plot
    x = [0] + x
    y = [0] + y
    c = [0] + c
    
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
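A compact, self-contained sketch of the inter/intra-cluster measurement above, assuming current scikit-learn (n_clusters instead of k) and random blobs in place of the TF-IDF matrix built from the TCU data:

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)

mbkm = MiniBatchKMeans(n_clusters=4, init='random', n_init=3, random_state=0)
mbkm.fit(X)

# Inter-cluster score: smallest distance between two distinct centers.
dist_centers = pairwise.euclidean_distances(mbkm.cluster_centers_)
inter = np.min(dist_centers[dist_centers > 0])

# Intra-cluster score: mean distance of each point to its assigned center.
dist_to_centers = mbkm.transform(X)
intra = np.mean(dist_to_centers[np.arange(len(X)), mbkm.labels_])

print(inter / intra)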
Example #14
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)

print "Extracting features from the dataset using the same vectorizer"
Example #15
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15, indices=False)
for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' %(iter_no + 1)
    y_train = np.array([ x for (x, y) in zip(all_data['target'], train_index) if y ])
    y_test  = np.array([ x for (x, y) in zip(all_data['target'], test_index) if y ])
    print 'Sampled %d training and %d test posts' %(len(y_train), len(y_test))

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8', 
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
            )
        )
    title_train = title_vectorizer.fit_transform([ x for (x, y) in zip(all_data['title'], train_index) if y ])
    
    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform([ x for (x, y) in zip(all_data['domain'], train_index) if y ])
    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform([ x for (x, y) in zip(all_data['title'], test_index) if y ])
    domain_test = domain_vectorizer.transform([ x for (x, y) in zip(all_data['domain'], test_index) if y ])
    X_test = domain_test
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_test.shape
Example #16
dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42)

print "%d documents" % len(dataset.data)
print "%d categories" % len(dataset.target_names)
print

labels = dataset.target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset.data)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(k=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=1)
Example #17
categories = ['SafeHarbor', 'Truste', 'Change', 'Location', 'Children', 'Contact',
              'Process', 'Retention']

# Load data
print "Loading privacy policy dataset for categories:"
print categories if categories else "all"
data_set = load_files('Privacypolicy_balance/raw', categories = categories,
                        shuffle = True, random_state = 42)
print 'data loaded'
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)

y = data_set.target

# feature selection
ch2 = SelectKBest(chi2, k = 1800)
X = ch2.fit_transform(X, y)

X = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
Example #19
data_train = load_files('./Train', categories = train_category,
                        shuffle = True, random_state = 42)

data_test = load_files('./Test/Unlabeled', categories = test_category,
                        shuffle = True, random_state = 42)
print 'data loaded'
print len(data_train.data)
print len(data_test.data)
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()

vectorizer = Vectorizer(max_features=10000)
X_test = vectorizer.fit_transform(data_test.data)
X_test = Normalizer(norm="l2", copy=False).transform(X_test)

X = vectorizer.transform(data_train.data)
X = Normalizer(norm="l2", copy=False).transform(X)

X = X.toarray()
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set  - n_samples: %d, n_features: %d" % (test_samples, test_features)
print
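The snippet above fits the vectorizer on the unlabeled test files and only transforms the training files; the more common arrangement is the reverse. A minimal sketch of that usual pattern, assuming current scikit-learn and toy documents in place of the load_files data:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

train_docs = ["we collect your email address", "cookies track your session"]
test_docs = ["we may share your email with partners"]

vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_docs)   # learn vocabulary and IDF on train
X_test = vectorizer.transform(test_docs)         # reuse them on unseen data

normalizer = Normalizer(norm="l2", copy=False)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)
print(X_train.shape, X_test.shape)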