Example #1
def _word_tfidf_dist(documents):
    words_tfidf = {}
    
    if len(documents) > 0: 
        if _check_is_sentence(documents): #acronym-like documents (only 1-3 char tokens) are handled by _freqdist below
            try:
                text_analyzer = Vectorizer(ngram_range=(1,2),max_features=50)
                matrix = text_analyzer.fit_transform(documents).todense()
                
                for word, indice in text_analyzer.vocabulary_.items():
                    words_tfidf[word] = score_tfidf_freq(matrix[:, indice])
            except ValueError:
                return {}
        else:
            return _freqdist(documents)
    return words_tfidf
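For reference, a minimal sketch of the same per-word scoring with the current scikit-learn API. It assumes the unshown score_tfidf_freq helper simply averages a word's column of TF-IDF values, which is only a guess:

# A minimal sketch with the modern TfidfVectorizer API.
# Assumption: the per-word score is the mean TF-IDF of that word's column,
# standing in for the unshown score_tfidf_freq helper.
from sklearn.feature_extraction.text import TfidfVectorizer

def word_tfidf_dist_sketch(documents):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50)
    try:
        matrix = vectorizer.fit_transform(documents)   # sparse (n_docs, n_terms)
    except ValueError:                                 # e.g. empty vocabulary
        return {}
    scores = matrix.mean(axis=0).A1                    # mean TF-IDF per column
    return {word: scores[idx] for word, idx in vectorizer.vocabulary_.items()}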
Example #2
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix. Weights can be
    based on IDF if requested.

    Arguments
    ---------
    fpath: path to a file
        Each line is a song; tags are separated by spaces.
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF weighting.
    '''
    #Vectorizes to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
Example #3
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            data_vec = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_data_vec = np.sort(np.unique(inversed_data[i]))
            assert ((data_vec == inversed_data_vec).all())
Example #4
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix. Weights can be
    based on IDF if requested.

    Arguments
    ---------
    fpath: path to a file
        Each line is a song; tags are separated by spaces.
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF weighting.
    '''
    #Vectorizes to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
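A hypothetical usage sketch, assuming a tag file laid out as the docstring describes (one entry per line, tags separated by spaces); the file name is illustrative only:

# Hypothetical call; 'video_tags.txt' is an illustrative path.
sparse_matrix, vocabulary = vectorize_videos('video_tags.txt', use_idf=True)
print(sparse_matrix.shape, len(vocabulary))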
Example #5
 def __init__(self, data, n_train):
   try:
     self.i_body = data.features.index('body')
   except ValueError:
     raise ValueError('please include body')
   comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
   comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
   scores = np.array(list(s for (i, s, f) in scores))
   self.vectorizer = Vectorizer(ngram_range=(1,2))
   X = self.vectorizer.fit_transform(comments)
   self.model = LinearRegression(normalize=True).fit(X, scores)
Example #6
 def __init__(self, data, n_train):
     try:
         self.i_body = data.features.index('body')
     except ValueError:
         raise ValueError('please include body')
     comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
     comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
     scores = np.array(list((s for (i, s, f) in scores)))
     self.vectorizer = Vectorizer()
     X = self.vectorizer.fit_transform(comments)
     self.model = Perc(max_iter=100, tol=None)
     self.model.fit(X, scores)
Example #7
def main(args):
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer()

    early_stopping = EarlyStopping(monitor='val_loss', patience=1)
    for cat1ind in range(len(cats)-1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind+1,len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)
            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            print(scaler.fit(vectors))
            ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)
            ## Put targets in the range -1 to 1 instead of 0/1
            binary_targets = newsgroups_train.target
            class_targets = newsgroups_train.target * 2 - 1
            cat_targets = to_categorical(binary_targets)
            #targets = newsgroups_train.target * 2 - 1

            print("Classifying %s vs. %s" % (cat1, cat2))

            ## Get NN performance
            print("Classifying with svm-like (no hidden layer) neural network:")
            #model = get_model(vectors.shape[1])
            sp_model = KerasClassifier(build_fn=get_svmlike_model,
                input_dims=vectors.shape[1],
                l2_weight=0.01,
                epochs=50,
                validation_split=0.2,
                batch_size=32)

            #score = np.average(cross_val_score(sp_model, vectors.toarray(), newsgroups_train.target, scoring=scorer, n_jobs=1, fit_params=dict(verbose=1, callbacks=[early_stopping])))
            param_grid={'l2_weight':[0.001], 'lr':[0.1]}
            clf = GridSearchCV(sp_model, param_grid, scoring=scorer)
            clf.fit(vectors.toarray(), class_targets)
            print("\nScore of nn cross-validation=%f with parameters=%s" % (clf.best_score_, clf.best_params_))

            ####################################################################
            # Below here is the actual svm
            ####################################################################
            print("Classifying with linear svm:")
            max_score = max_c = 0
            params = {'C':[0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was %f with c=%f" % (clf.best_score_, clf.best_params_['C']))

            sys.exit(-1)
Example #8
def tfidf(corpus, r=(1, 1), midf=3, madf=0.7, feats=10000, sublinear=True):
    vectorizer = Vectorizer(min_df=midf,
                            max_df=madf,
                            ngram_range=r,
                            max_features=feats,
                            stop_words='english',
                            sublinear_tf=sublinear)
    X = vectorizer.fit_transform(corpus)

    print("n_samples: %d, n_features: %d" % X.shape)

    return X
Example #9
    def __init__(self, data, n_train):
        try:
            self.i_body = data.features.index('body')
        except ValueError:
            raise ValueError('please include body')

        comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
        comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
        scores = np.array(list((s for (i, s, f) in scores)))
        self.vectorizer = Vectorizer()
        X = self.vectorizer.fit_transform(comments)
        self.model = LogisticRegression(solver='lbfgs').fit(X, scores)
Example #10
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            data_vec = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_data_vec = np.sort(np.unique(inversed_data[i]))
            assert ((data_vec == inversed_data_vec).all())
    # Test that inverse_transform also works with numpy arrays
    transformed_data = np.asarray(transformed_data.todense())
    inversed_data2 = vectorizer.inverse_transform(transformed_data)
    for terms, terms2 in zip(inversed_data, inversed_data2):
        assert ((terms == terms2).all())
Example #11
def train_classifier(texts, y):
    '''
    Here is a perfect example of the "feel it ... func it" philosophy:

    The pype call uses the function arguments and function body to specify 
    three variables, texts, a list of strings, y, a list of floats, and vectorizer,
    a scikit-learn object that vectorizes text.  This reiterates the advice that you
    should use the function body and function arguments to declare your scope,
    whenever you can.  

    Line-by-line, here we go:

    {'vectorizer':vectorizer.fit,
     'X':vectorizer.transform},

    We build a dict, the first element of which is the fit vectorizer.  Luckily, the
    'fit' function returns an instance of the trained vectorizer, so we do not need to
    use _do.  This vectorizer is then assigned to 'vectorizer'.  Because iterating
    through dictionaries in Python 3.6+ preserves the order in which the keys were
    declared, we can apply the fit function to the vectorizer on the texts and
    assign the result to the 'vectorizer' key.  We need this instance of the vectorizer to
    run the classifier for unknown texts.

    After this, we apply the 'transform' to convert the texts into a training matrix
    keyed by 'X', whose rows are texts and whose columns are words. 

    _a('classifier',(Classifier().fit,_['X'],y)),

    Finally, we can build a classifier.  _a, or _assoc, means we are adding a 
    key-value pair to the previous dictionary.  This will be a new instance of our
    Classifier, which is trained through the fit function on the text-word matrix 'X'
    and the labels vector y.

    _d('X'),

    Since we don't need the X matrix anymore, we delete it from the returned JSON,
    which now only contains 'vectorizer' and 'classifier', the two things we will
    need to classify unknown texts.
    '''
    vectorizer = Vectorizer()

    return p(
        texts,
        {
            'vectorizer': vectorizer.fit,
            'X': vectorizer.transform
        },
        _a('classifier', (Classifier().fit, _['X'], y)),
        _d('X'),
    )
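For comparison, a minimal sketch of the same training step without pype, assuming Vectorizer and Classifier stand for a scikit-learn TfidfVectorizer and an arbitrary estimator (LogisticRegression is used here purely for illustration):

# Plain scikit-learn equivalent of the dict / _a / _d pipeline above (a sketch).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def train_classifier_plain(texts, y):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)              # the 'X' key in the pype version
    classifier = LogisticRegression().fit(X, y)
    # The intermediate matrix is dropped here, mirroring _d('X')
    return {'vectorizer': vectorizer, 'classifier': classifier}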
Example #12
def get_20newsgroups_data_info_for_categories(categories):
    data = fetch_20newsgroups(subset='all', categories=categories, shuffle=False)
    vectorizer = Vectorizer()
    t0 = time()
    
    tfidf = vectorizer.fit_transform(data.data)
    
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    print "done in %fs" % (time() - t0)
    
    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]
    
    # Similarity is from Zero to One - so (1-s) gives distance from 0 to 1.
    distances = [[(1-s) for s in row[:col_to+1]]for (col_to, row) in enumerate(pairwise_similarity)]
    
    # Fix the very slight off-ness involved in precision-conversion
    for row in distances:
        row[-1] = 0 
    
    pcd_tuples = zip(payloads, labels, distances)
    
    di = DataInfo.deserialize_pcd_tuples(pcd_tuples)
    return di
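The tfidf * tfidf.T product above yields pairwise cosine similarity because the vectorizer L2-normalizes rows by default; a sketch of the same step with the explicit scikit-learn helper:

# Equivalent pairwise similarity/distance via the explicit helper (a sketch;
# tfidf is the matrix produced by vectorizer.fit_transform above).
from sklearn.metrics.pairwise import cosine_similarity

pairwise_similarity = cosine_similarity(tfidf)   # dense (n_docs, n_docs) array
distances = 1.0 - pairwise_similarity            # similarity in [0, 1] -> distance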
Example #13
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            terms = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_terms = np.sort(np.unique(inversed_data[i]))
            assert_array_equal(terms, inversed_terms)

    # Test that inverse_transform also works with numpy arrays
    transformed_data = transformed_data.toarray()
    inversed_data2 = vectorizer.inverse_transform(transformed_data)
    for terms, terms2 in zip(inversed_data, inversed_data2):
        assert_array_equal(terms, terms2)
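A self-contained round trip with the current API looks roughly like this (a sketch: TfidfVectorizer replaces the old Vectorizer alias and a tiny inline corpus stands in for ALL_FOOD_DOCS):

# Minimal standalone inverse_transform round trip.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["pizza with tomato and salad", "water and beer with pizza"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)
inversed = vec.inverse_transform(X)              # list of term arrays, one per doc
analyzer = vec.build_analyzer()
for doc, terms in zip(docs, inversed):
    assert set(analyzer(doc)) == set(terms)      # same terms; order not guaranteed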
Example #14
def make_vectorizer(top_words):
    return Vectorizer(

        # Split based on words
        analyzer="word",

        # Cap features
        max_features=top_words,

        # Only take words that appear in 2 or more headlines
        # (deciding based on headline-unique
        # words is probably over-fitting)
        min_df=2,

        # Make the math simple for computing
        # probabilities / info-gain (word inclusion is either-or)
        # Shouldn't have a significant impact because very few
        # headlines contain duplicate words
        binary=True)
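A hypothetical usage sketch of make_vectorizer on a tiny headline corpus (the headlines and the top_words value are illustrative only):

# Hypothetical usage; with min_df=2 only words shared by 2+ headlines survive.
headlines = [
    "stocks rally as markets rebound",
    "markets fall after rally fades",
    "local team wins championship game",
]
vectorizer = make_vectorizer(top_words=100)
X = vectorizer.fit_transform(headlines)          # binary presence matrix
print(X.shape, sorted(vectorizer.vocabulary_))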
Example #15
def main(args):
    scorer = make_scorer(accuracy_score)
    vectorizer = Vectorizer()
    epochs = 50
    valid_pct = 0.2

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            print("Classifying %s vs. %s" % (cat1, cat2))
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers',
                                                          'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)

            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)
            ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)
            ## Put targets in the range -1 to 1 instead of 0/1
            binary_targets = newsgroups_train.target
            class_targets = newsgroups_train.target * 2 - 1
            ####################################################################
            # Below here is the actual svm
            ####################################################################
            max_score = max_c = 0
            params = {'C': [0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was acc=%f with c=%f" %
                  (clf.best_score_, clf.best_params_['C']))
Example #16
def test_pickle():
    for obj in (CountVectorizer(), TfidfTransformer(), Vectorizer()):
        s = pickle.dumps(obj)
        assert_equal(type(pickle.loads(s)), obj.__class__)
Example #17
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
                               shuffle=True, random_state=42)

filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)

documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
Example #20
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']
    
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    
    #Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    #Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)
    
    #Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in xrange(n_runs):
        for k in k_vals:
            #Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))
            
            #Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)
            
            centers = mbkm.cluster_centers_
            labels = mbkm.labels_
            
            #Inter-cluster distance. We use min because the idea is to maximize this.
            #Min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            #Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)
            
            #Prints num elements per cluster
            print('Run %d ; k = %d' %(i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' %(cluster, population))
            print()
    
    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))
    
    #hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c
    
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
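The helper half_confidence_interval_size is not shown in this snippet; a plausible stand-in (an assumption, using a normal approximation for the mean of the n_runs ratio values) would be:

# Hypothetical sketch of the missing helper: half-width of a two-sided
# confidence interval for the mean, via a normal approximation.
import numpy as np
from scipy import stats

def half_confidence_interval_size(values, confidence=0.90):
    values = np.asarray(values, dtype=float)
    sem = values.std(ddof=1) / np.sqrt(len(values))   # standard error of the mean
    z = stats.norm.ppf(0.5 + confidence / 2.0)        # two-sided critical value
    return z * sem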
Example #21
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
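The "direct tfidf vectorizer" check above relies on TfidfVectorizer being equivalent to CountVectorizer followed by TfidfTransformer; a compact sketch of that equivalence with the current API:

# Sketch: TfidfVectorizer == CountVectorizer + TfidfTransformer (same params).
import numpy as np
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                              TfidfVectorizer)

docs = ["the pizza pizza beer", "the salad water coke", "the tomato burger"]
counts = CountVectorizer().fit_transform(docs)
two_step = TfidfTransformer(norm='l1').fit_transform(counts).toarray()
direct = TfidfVectorizer(norm='l1').fit_transform(docs).toarray()
np.testing.assert_array_almost_equal(two_step, direct)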
Example #22
 def __init__(self):
     self.__morph = MorphAnalyzer()
     self.__tokenizer = RegexpTokenizer(r'\w+')
     self.__vectorizer = Vectorizer()
                              shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
Example #24
print categories if categories else "all"
data_set = load_files('Privacypolicy/raw', categories = categories,
                        shuffle = True, random_state = 42)
print 'data loaded'
# print "%d documents" % len(data_set.data)
# print "%d categories" % len(data_set.target_names)
print

# load unlabeled data
data_set_unlabel = load_files('Privacypolicy/unlabeled', shuffle = True, random_state = 30)


# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
X = X.toarray()

X_unlabel = vectorizer.transform(data_set_unlabel.data)
X_unlabel = X_unlabel.toarray()

y = data_set.target

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print

Example #25
    sys.exit(1)


input_data = csv.reader(open('descriptions_100.csv','rb'))
dataset_data = []
dataset_target = []
for row in input_data:
    dataset_data.append(row[1])
    dataset_target.append(row[0])

labels = dataset_target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset_data)
print X

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape


###############################################################################
# Do the actual clustering

km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1, init_size=1000,
                     batch_size=1000, verbose=1)

print "Clustering with %s" % km
t0 = time()
km.fit(X)
Example #26
def main(args):
    vectorizer = Vectorizer()
    epochs = 500
    valid_pct = 0.2
    default_lr = 10.0
    batch_size = 64
    MAX_TRIES = 5

    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            print("Classifying %s vs. %s" % (cat1, cat2))

            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers',
                                                          'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)

            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)
            ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)

            train_X_tensor = Tensor(scaled_vectors.toarray())
            if torch.cuda.is_available():
                pyt_data = TensorDataset(train_X_tensor.cuda(),
                                         train_X_tensor.cuda())
            else:
                pyt_data = TensorDataset(train_X_tensor, train_X_tensor)

            end_train_range = int((1 - valid_pct) * vectors.shape[0])
            iterations = 0
            valid_X = pyt_data[end_train_range:][0]
            valid_y = train_X_tensor[end_train_range:]

            my_lr = default_lr

            model = AutoEncoderModel(vectors.shape[1], 2, lr=my_lr)

            for try_num in range(MAX_TRIES):
                nan = False
                for epoch in range(epochs):
                    nan = False
                    epoch_loss = 0
                    for batch_ind in range(end_train_range // batch_size):
                        start_ind = batch_ind * batch_size
                        end_ind = min(start_ind + batch_size, end_train_range)
                        item = pyt_data[start_ind:end_ind]
                        model.train()

                        iterations += 1
                        answer = model(Variable(item[0]))

                        loss = model.criterion(answer, Variable(item[1]))
                        if np.isnan(loss.data[0]):
                            sys.stderr.write(
                                "Training batch %d at epoch %d has nan loss\n"
                                % (batch_ind, epoch))
                            nan = True
                            break

                        epoch_loss += loss
                        loss.backward()
                        model.update()

                        if nan:
                            my_lr /= 2.
                            if try_num + 1 < MAX_TRIES:
                                print(
                                    "Attempting another try (%d) with learning rate halved to %f"
                                    % (try_num + 1, my_lr))
                            else:
                                print(
                                    "Every learning rate resulted in NaN. Quitting."
                                )
                            break

                    ## Compute validation loss:
                    valid_batch = pyt_data[end_train_range:][0]
                    valid_answer = model(Variable(valid_batch))
                    valid_loss = model.criterion(
                        valid_answer, Variable(pyt_data[end_train_range:][1]))
                    #valid_f1 = f1_score(np.sign(valid_answer.data.numpy()), pyt_data[end_train_range:][1].numpy(), pos_label=-1)
                    #valid_acc = accuracy_score(np.sign(valid_answer.cpu().data.numpy()), valid_y.numpy())
                    if epoch % 10 == 0:
                        print(
                            "Epoch %d with training loss %f and validation loss %f"
                            % (epoch, epoch_loss.data[0], valid_loss.data[0]))

                    if nan:
                        break

                if not nan:
                    ## If we got through the epochs without nan then save the model
                    ## and don't take any more tries:

                    break
Example #27

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)
Example #28
def main(args):
    epochs = 100
    valid_pct = 0.2
    default_lr = 0.01
    default_c = 1.0
    default_decay = 0.0
    hidden_nodes = 128
    batch_size = 512

    if len(args) < 1:
        sys.stderr.write("One required argument: <train directory>\n")
        sys.exit(-1)

    gpu = False
    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")
        gpu = True

    examples = []
    labels = []
    sys.stderr.write("Loading data\n")
    with open(join(args[0], 'train.csv')) as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            labels.append(int(row[0]))
            examples.append(row[1])

    sys.stderr.write("Transforming data into feature vectors\n")
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer(max_features=10000)

    vectors = vectorizer.fit_transform(examples)
    all_y = np.array(labels).astype('float32')
    ## These labels are 1-2 by definition, relabel to +1/-1 for hinge loss:
    binary_targets = all_y - 1
    class_targets = ((all_y - 1) * 2) - 1

    sys.stderr.write("Rescaling data\n")
    scaler = StandardScaler(with_mean=False)
    scaler.fit(vectors)
    ## Doesn't seem to matter
    scaled_vectors = scaler.transform(vectors).astype('float32')
    num_valid_instances = min(1000, int(valid_pct * vectors.shape[0]))
    end_train_range = vectors.shape[0] - num_valid_instances

    ####################################################################
    # Here is the actual svm
    ####################################################################
    # print("Classifying with linear svm:")
    # max_score = max_c = 0
    # params = {'C':[0.001, 0.01, 0.1, 1.0]}
    # svc = svm.LinearSVC()
    # clf = GridSearchCV(svc, params, scoring=scorer)
    # clf.fit(vectors, binary_targets)
    # print("Best SVM performance was with c=%f" % (clf.best_params_['C']))
    # svc = svm.LinearSVC(C=clf.best_params_['C'])
    # svc.fit(scaled_vectors[:end_train_range], binary_targets[:end_train_range])
    # score = svc.score(scaled_vectors[end_train_range:], binary_targets[end_train_range:])
    # print("** SVM score with standard validation set is **%f**" % (score))

    ####################################################################
    # transform the data into pytorch format:
    ####################################################################
    yelp_train_data = YelpPolarityDataset(scaled_vectors[:end_train_range],
                                          class_targets[:end_train_range])
    train_loader = DataLoader(yelp_train_data,
                              shuffle=True,
                              batch_size=batch_size,
                              num_workers=1)
    yelp_valid_data = YelpPolarityDataset(scaled_vectors[end_train_range:],
                                          class_targets[end_train_range:])
    valid_loader = DataLoader(yelp_valid_data)

    ## Get NN performance
    iterations = 0
    my_lr = default_lr

    model = SvmlikeModel(vectors.shape[1],
                         lr=my_lr,
                         c=default_c,
                         decay=default_decay)
    if gpu:
        model.cuda()

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_start = time.time()
        for data in train_loader:
            inputs, labels = data

            if gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            model.train()
            answer = model(inputs)
            loss = model.criterion(answer, labels)
            epoch_loss += loss
            loss.backward()
            model.update()

        train_time = time.time() - epoch_start
        #print("Training during epoch %d took %fs\n" % (epoch, train_time))

        valid_acc = 0.0
        valid_loss = 0.0
        for data in valid_loader:
            inputs, labels = data

            if gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            valid_answer = model(inputs)
            valid_loss += model.criterion(valid_answer, labels)

            data_proportion = float(inputs.size()[0]) / num_valid_instances
            valid_acc += data_proportion * accuracy_score(
                np.sign(valid_answer.cpu().data.numpy()),
                labels.cpu().data.numpy())

        prev_valid_acc = valid_acc

        end_time = time.time()
        duration = end_time - epoch_start

        if epoch % 1 == 0:
            print(
                "Epoch %d took %ds with training loss %f and validation loss %f, acc=%f"
                % (epoch, duration, epoch_loss.data[0], valid_loss.data[0],
                   prev_valid_acc))
Example #29
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    
    #We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    #Get invalid lines
    invalids = invalid(data)
    
    #Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    #Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)
    
    #Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init = 'random')
    mbkm.fit(doc_sparse_matrix)
    
    #New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_
    
    #Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']
    
    #And a new column Superfaturamento
    super_faturamento = np.ndarray(shape=len(labels_column), dtype = 'S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50: #up to 50% over the reference price is accepted
            super_faturamento[i] = 'OK'
        elif ganho < -50 and ganho > -500: #beyond that it counts as overbilled
            super_faturamento[i] = 'Super'
        elif ganho < -500: #more than 5x the value is extreme
            super_faturamento[i] = 'SuperPlus'
    
    for i in xrange(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer as Vectorizer
from sklearn.naive_bayes import BernoulliNB as Classifier
from sklearn.externals import joblib
from utils import *

vectorizer = Vectorizer(min_df=1)
classifier = Classifier()

males_train = read_lines('./data/male_train.txt')
females_train = read_lines('./data/female_train.txt')
train_set = males_train + females_train
labels = [MALE_LABEL] * len(males_train) + [FEMALE_LABEL] * len(females_train)

train_features = vectorizer.fit_transform(train_set)
classifier.fit(train_features, labels)

# save model
joblib.dump(vectorizer, './data/models/gender/vectorizer.pkl')
joblib.dump(classifier, './data/models/gender/classifier.pkl')

test_set = [
    u'Снегирев Роман', u'Ольга Лепорская', u'Саша Сидоров',
    u'Арнольд Шварценеггер', u'Женя Иванова', u'Вера Брежнева',
    u'Алексей Вальков', u'Yakov Malinov', u'Лука Мудищев', u'Пидор Мутный'
]

test_features = vectorizer.transform(test_set)
predictions = classifier.predict(test_features)
print predictions
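Reloading the persisted gender model later would look roughly like this (a sketch; in recent scikit-learn versions the standalone joblib package replaces sklearn.externals.joblib):

# Sketch: load the persisted vectorizer/classifier and predict on new names.
import joblib

vectorizer = joblib.load('./data/models/gender/vectorizer.pkl')
classifier = joblib.load('./data/models/gender/classifier.pkl')

def predict_gender(names):
    return classifier.predict(vectorizer.transform(names))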
data_train = fetch_20newsgroups(subset='train', categories=categories,
                               shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                              shuffle=True, random_state=42)

categories = data_train.target_names
print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

# Extracting features from the training dataset using a sparse vectorizer
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)

# Extracting features from the test dataset using the same vectorizer
X_test = vectorizer.transform(data_test.data)

###############################################################################
# Benchmark classifier

# sklearn.naive_bayes.MultinomialNB(alpha=310) gives 84.3%
# LogisticRegression(C=1.9) gives 85.9%
#clf = LogisticRegression(C=1.9, penalty='l2')
# LinearSVC(C=.17) gives 85.9%

from sklearn.svm.sparse import LinearSVC
clf = LinearSVC(C=.17)
Example #32
      print i, data_train.target_names[i]


# A primary thought on implementing a multi-label classifier
# Aborted later because most classifiers already provide this functionality
# Method: Transform y to one-else and use loops to learn binary classifiers
# y_0 = y.copy()
# for i in range(len(y_0)):
#       if y_0[i] == 1:
#             y_0[i] = 2


# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print
# to dense array for logistic regression which does not work on sparse
X_den = X.toarray()


# # Feature selection
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X = ch2.fit_transform(X, y)
# print "Done in %fs" % (time() - t0)
                              shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
Example #34
#transform the categorical features with one-hot, then concatenate with numeric features
X_cat_feat = pd.get_dummies(trainData[catFeatures]).fillna(0).as_matrix()
test_cat_feat = pd.get_dummies(testData[catFeatures]).fillna(0).as_matrix()
cat_feat_labels = list(pd.get_dummies(trainData[catFeatures]))
print(cat_feat_labels)

# In[ ]:

#transform the text features with scikit-learn's TfidfVectorizer (binary mode)
from sklearn.feature_extraction.text import TfidfVectorizer as Vectorizer
X_text_feat = np.ndarray(shape=(len(trainData), 0))
test_text_feat = np.ndarray(shape=(len(testData), 0))
text_feat_labels = []

for column in textColumns:
    vectorizer = Vectorizer(binary=True)
    #fit to the training data, transform the test data
    X_column_feat = vectorizer.fit_transform(trainData[column]).toarray()
    test_column_feat = vectorizer.transform(testData[column]).toarray()

    #append to the text features arrays
    X_text_feat = np.append(X_text_feat, X_column_feat, axis=1)
    test_text_feat = np.append(test_text_feat, test_column_feat, axis=1)
    text_feat_labels += vectorizer.get_feature_names()
print(X_text_feat.shape)
print(len(text_feat_labels))

# In[ ]:

#Build the full feature matrices
X_full = np.append(np.append(X_num_feat, X_cat_feat, axis=1),
                              shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
Example #36
                               random_state=42)
print 'data loaded'

categories = data_train.target_names  # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
                               random_state=42)
print 'data loaded'

categories = data_train.target_names  # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
from sklearn import metrics


# The training data folder must be passed as first argument
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_fraction=0.5)


# TASK: Build a vectorizer that splits strings into sequence of 1 to 3
# of 3 consecutive chars (1-grams, 2-grams and 3-grams of characters)
# with IDF weights disabled (normalized term frequencies only)
vectorizer = Vectorizer(analyzer='char', min_n=1, max_n=3, use_idf=False)

# TASK: Chain the vectorizer with a linear classifier into a Pipeline
# instance. Its variable should be named `pipeline`.
pipeline = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron()),
])

# TASK: Fit the pipeline on the training set
pipeline.fit(docs_train, y_train)

# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = pipeline.predict(docs_test)

# Print the classification report
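A report step consistent with the metrics import above might look like this (a sketch, not necessarily the exercise's reference solution):

# Sketch: per-language precision/recall/F1 on the held-out half.
print(metrics.classification_report(y_test, y_predicted,
                                     target_names=dataset.target_names))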
Example #39
print 'Data loaded!'
print


# Split datasets
y_L1 = data_train.target
y_L2_ca = ca_train.target
y_L2_collect = collect_train.target
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target


# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
print

print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA:      n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
Example #41
#    'svc': [{'probability': True}],
}

# split a training set and a test set
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15, indices=False)
for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' %(iter_no + 1)
    y_train = np.array([ x for (x, y) in zip(all_data['target'], train_index) if y ])
    y_test  = np.array([ x for (x, y) in zip(all_data['target'], test_index) if y ])
    print 'Sampled %d training and %d test posts' %(len(y_train), len(y_test))

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8', 
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
            )
        )
    title_train = title_vectorizer.fit_transform([ x for (x, y) in zip(all_data['title'], train_index) if y ])
    
    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform([ x for (x, y) in zip(all_data['domain'], train_index) if y ])
    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform([ x for (x, y) in zip(all_data['title'], test_index) if y ])
    domain_test = domain_vectorizer.transform([ x for (x, y) in zip(all_data['domain'], test_index) if y ])
print 'data loaded'

documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
Example #43
    shuffle=True,
    random_state=42)
print 'Data loaded!'
print

# Split datasets
y_L1 = data_train.target
y_L2_ca = ca_train.target
y_L2_collect = collect_train.target
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target

# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
print

print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA:      n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
Example #44
dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42)

print "%d documents" % len(dataset.data)
print "%d categories" % len(dataset.target_names)
print

labels = dataset.target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset.data)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(k=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
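
The MiniBatchKMeans call above is cut off mid-argument list. A sketch of a complete call using the current parameter name n_clusters (older releases spelled it k); X and true_k are assumed to be the matrix and cluster count built earlier in the snippet:

km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000)
km.fit(X)
doc_clusters = km.labels_    # one cluster id per document
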
def main(args):
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer()
    epochs = 100
    valid_pct = 0.2
    default_lr = 0.1
    default_c = 0.1
    default_decay = 0.0
    hidden_nodes = 128
    batch_size = 64

    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")

    for cat1ind in range(len(cats)-1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind+1,len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)


            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)
            ## Scaling did not noticeably change the results in practice
            scaled_vectors = scaler.transform(vectors)
            ## Target vectors for the SVM:
            binary_targets = newsgroups_train.target
            ## Put targets in the range -1 to 1 instead of 0/1 for the nn hinge loss
            class_targets = newsgroups_train.target * 2 - 1

            train_X_tensor = Tensor(scaled_vectors.toarray())
            train_y_tensor = Tensor(class_targets)
            if torch.cuda.is_available():
                pyt_data = TensorDataset(train_X_tensor.cuda(),  train_y_tensor.cuda())
            else:
                pyt_data = TensorDataset(train_X_tensor,  train_y_tensor)


            print("Classifying %s vs. %s" % (cat1, cat2))
            end_train_range = int((1-valid_pct) * vectors.shape[0])

            ## Get NN performance
            iterations = 0
            train_X = pyt_data[:end_train_range][0]
            train_y = train_y_tensor[:end_train_range]
            valid_X = pyt_data[end_train_range:][0]
            valid_y = train_y_tensor[end_train_range:]

            my_lr = default_lr

            ####################################################################
            # Here is the actual svm
            ####################################################################
            print("Classifying with linear svm:")
            max_score = max_c = 0
            params = {'C':[0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was with c=%f" % (clf.best_params_['C']))
            svc = svm.LinearSVC(C=clf.best_params_['C'])
            svc.fit(scaled_vectors[:end_train_range], binary_targets[:end_train_range])
            score = svc.score(scaled_vectors[end_train_range:], binary_targets[end_train_range:])
            print("** SVM score with standard validation set is **%f**" % (score))


            ####################################################################
            # Here are the different NN models
            ####################################################################
            for model_ind in (SVM_LIKE,):
                if model_ind == SVM_LIKE:
                    print("Classifying with svm-like (no hidden layer) neural network:")
                elif model_ind == ONE_LAYER:
                    print("Classifying with one hidden layer neural network with %d hidden nodes" % (hidden_nodes))
                elif model_ind == SVM_INIT:
                    print("Classifying with one hidden layer with %d hidden nodes initialized by previous system" % (hidden_nodes))
                elif model_ind == SVM_REG:
                    print("Classifying with one hidden layer with %d hidden nodes initialized and regularized by svm-like system." % (hidden_nodes))

                for try_num in range(5):
                    init_reg = False
                    weight_reg = L2VectorLoss()
                    if model_ind == SVM_LIKE:
                        model = SvmlikeModel(vectors.shape[1], lr=my_lr, c=default_c, decay=default_decay)
                        svmlike_model = model
                    elif model_ind == ONE_LAYER:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr)
                    elif model_ind == SVM_INIT:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr, init=saved_weights)
                    else:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr, init=saved_weights)
                        init_reg = True
                        init_weight_reg = L2VectorLoss()

                    ## Move the model to the GPU:
                    if torch.cuda.is_available():
                        model.cuda()

                    valid_answer = model(Variable(valid_X)).cpu()
                    valid_acc = prev_valid_acc = accuracy_score(np.sign(valid_answer.data.numpy()), valid_y.numpy())
                    nan = False

                    for epoch in range(epochs):
                        #print("Epoch %d" % (epoch))
                        # train on mini-batches of the shuffled training data:
                        nan = False
                        epoch_loss = 0
                        ## Shuffle data:
                        shuffle = torch.randperm(end_train_range)
                        train_X = train_X[shuffle]
                        train_y = train_y[shuffle]
                        for batch_ind in range(end_train_range // batch_size):
                            start_ind = batch_ind * batch_size
                            end_ind = min(start_ind+batch_size, end_train_range)
                            item = (train_X[start_ind:end_ind], train_y[start_ind:end_ind])
                            model.train()
                            iterations += 1
                            answer = model(Variable(item[0]))
                            if epoch == 0 and model_ind == SVM_INIT:
                                svm_answer = svmlike_model(Variable(item[0]))

                            loss = model.criterion(answer[:,0],  Variable(item[1]))
                            if np.isnan(loss.data[0]):
                                sys.stderr.write("Training batch %d at epoch %d has nan loss\n" % (batch_ind, epoch))
                                nan = True
                                break

                            epoch_loss += loss
                            loss.backward()

                            ## If we're on the model that regularizes towards the initial conditions then run
                            ## that loss function:
                            if init_reg and epoch > 0:
                                init_weight_reg_loss = init_weight_reg(model.fc1.weight[0,:], saved_weights)
                                init_weight_reg_loss.backward()

                            ## Optional L2 weight regularization (currently disabled):
                            #weight_reg_loss = weight_reg(model.fc1.weight, torch.zeros(model.fc1.weight.size()))
                            #weight_reg_loss.backward()

                            model.update()
                            #print("Epoch %d with loss %f and cumulative loss %f" % (epoch, loss.data[0], epoch_loss.data[0]))

                        if nan:
                            break

                        valid_batch = pyt_data[end_train_range:][0]
                        valid_answer = model(Variable(valid_batch))[:,0]
                        valid_loss = model.criterion(valid_answer, Variable(pyt_data[end_train_range:][1]))
                        #valid_f1 = f1_score(np.sign(valid_answer.data.numpy()), pyt_data[end_train_range:][1].numpy(), pos_label=-1)
                        valid_acc = accuracy_score(np.sign(valid_answer.cpu().data.numpy()), valid_y.numpy())
                        if epoch % 10 == 0:
                            print("Epoch %d with training loss %f and validation loss %f, acc=%f" %
                                  (epoch, epoch_loss.data[0], valid_loss.data[0], prev_valid_acc))
                        prev_valid_acc = valid_acc

                    if not nan:
                        print("** Finished with validation accuracy **%f**" % (valid_acc))
                        if model_ind == 0:
                            saved_weights = Variable(model.fc1.weight[0,:].data)
                        break
                    elif try_num+1 < 5:
                        my_lr /= 2.
                        print("Attempting another try (%d) with learning rate halved to %f" % (try_num+1, my_lr))
                    else:
                        print("Ran out of tries, giving up on this classification task.")
categories = ['Advertising','CA', 'Collect', 'Cookies', 'Security', 'Share', 
            'SafeHarbor','Truste', 'Change', 'Location', 'Children', 'Contact', 
            'Process', 'Retention']

# Load data
print "Loading privacy policy dataset for categories:"
print categories if categories else "all"
data_set = load_files('Privacypolicy_balance/raw', categories = categories,
                        shuffle = True, random_state = 42)
print 'data loaded'
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)

y = data_set.target

# feature selection
ch2 = SelectKBest(chi2, k = 1800)
X = ch2.fit_transform(X, y)

X = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
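
SelectKBest(chi2, k=1800) above ranks features on the full data set. Once a held-out split is introduced, the ranking should be learned on the training portion only and merely applied to the test portion; a self-contained sketch of that pattern with synthetic stand-in data (the real matrix and k=1800 come from the snippet above):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X_feats = rng.random_sample((200, 50))         # chi2 requires non-negative features
y_labels = rng.randint(0, 3, size=200)

X_tr, X_te, y_tr, y_te = train_test_split(X_feats, y_labels,
                                           test_size=0.25, random_state=42)
selector = SelectKBest(chi2, k=10)             # k=1800 in the snippet above
X_tr_sel = selector.fit_transform(X_tr, y_tr)  # ranking learned on training data only
X_te_sel = selector.transform(X_te)            # same columns kept for held-out data
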
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print "n_samples: %d" % len(dataset.data)

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_fraction=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline using the previous
    # analyzer
    pipeline = Pipeline([
        ('vect', Vectorizer(max_features=100000, max_df=0.9)),
        ('clf', LinearSVC()),
    ])

    # TASK: Define a parameters grid for searching whether extracting bi-grams
    # is suited for this task, and which value of C in 1000 or 10000 is the
    # best for LinearSVC on this dataset.
    parameters = {
        'vect__max_n': (1, 2),
        'clf__C': (1000, 10000),
    }
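
    # Sketch only (not part of the original exercise skeleton): the TASK notes
    # below ask for exactly this kind of search. `grid_search` and n_jobs=-1 are
    # illustrative choices; in current scikit-learn GridSearchCV is imported from
    # sklearn.model_selection and the ngram parameter is 'vect__ngram_range'.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train[:200], y_train[:200])
    print grid_search.best_params_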

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    # To make this run faster, fit it only on the top first 200 documents of