class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
	def __init__(self):

		self.cv_bi = CountVectorizer(min_df=2,max_df=0.7,ngram_range=(1,2))
		self.tfidf_trans = TfidfTransformer()
		self.SVD_trans = TruncatedSVD(n_components=300)

	# X is a list of Fit_Review named tuples; y is ignored (None)
	def fit(self, X, y=None):

		texts = [review.text for review in X]

		counts = self.cv_bi.fit_transform(texts)
		counts_tfidf = self.tfidf_trans.fit_transform(counts)
		self.SVD_trans.fit(counts_tfidf)

		return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
	def transform(self, X):

		texts = [review.text for review in X]

		counts = self.cv_bi.transform(texts)
		counts_tfidf = self.tfidf_trans.transform(counts)
		counts_trunc = self.SVD_trans.transform(counts_tfidf)

		return counts_trunc
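# A minimal usage sketch for TF_Transformer (hypothetical data: Fit_Review is
# assumed to be a namedtuple with a `text` field, as the comments above imply,
# and the scikit-learn imports the class relies on are assumed to be in scope).
# The settings above (min_df=2, 300 SVD components) need a reasonably large
# corpus, so the toy reviews are generated rather than meaningful:
import random
from collections import namedtuple

Fit_Review = namedtuple('Fit_Review', ['text'])
toy_vocab = ['word%d' % i for i in range(500)]
reviews = [Fit_Review(text=' '.join(random.sample(toy_vocab, 20))) for _ in range(400)]

tf_trans = TF_Transformer().fit(reviews)
features = tf_trans.transform(reviews)
print(features.shape)   # expected: (400, 300)
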
def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  # the training corpus lives in this directory, one sub-folder per category (like comp.graphics etc.)
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted[0]]
def runSVCPipeline(entries, langs):
	t0 = time()
	svc_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	#dec = clf.decision_function([[1]])
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=np.unique(langs)))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return svc_pipeline
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def tf(train,test):
    """Transform feature vectors: TF"""
    trf = TfidfTransformer(use_idf=False)
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train,test
class BagOfWordView(View):
    """
    View that processes words (stemming, lowercasing) and counts each word's frequency
    """
    def __init__(self, *args, **kwargs):
        self.count_vec1 = None
        self.count_vec2 = None

        self.tfidf_vec1 = None
        self.tfidf_vec2 = None

        super(BagOfWordView, self).__init__(*args, **kwargs)

    def fit(self, v1, v2, use_idf=False):
        """
        v1, v2: iterables of str/unicode documents, as required by CountVectorizer.fit
        """
        ## TODO: add `use_tf` option
        self.count_vec1 = CountVectorizer().fit(v1)
        self.count_vec2 = CountVectorizer().fit(v2)

        self.tfidf_vec1 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec1.transform(v1))

        self.tfidf_vec2 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec2.transform(v2))

        return self

    def transform(self, v1, v2):
        return self.tfidf_vec1.transform(self.count_vec1.transform(v1)), \
            self.tfidf_vec2.transform(self.count_vec2.transform(v2))
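# A minimal usage sketch (hypothetical data; `View` and its constructor
# arguments are defined elsewhere, so only the fit/transform call pattern is
# shown here, commented out):
#
# view = BagOfWordView().fit(["first view doc one", "first view doc two"],
#                            ["second view doc one", "second view doc two"])
# tfidf1, tfidf2 = view.transform(["first view doc one"], ["second view doc one"])
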
def tfidf(train,test):
    """Transform feature vectors: TFIDF"""
    trf = TfidfTransformer()
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train,test
def train_randomforest(train, test, n_estimators=10, cpus=4):
    import pickle
    import numpy as np
    from scipy.sparse import csc_matrix
    from sklearn.preprocessing import OneHotEncoder

    vocabulary_size = 2000
    #keep commas and colons

    corpus = [t.text for t in train]
    test_corpus = [t.text for t in test]

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    """
    prepare text training data
    """
    count_vect = CountVectorizer(max_features=None)
    X_train_counts = count_vect.fit_transform(corpus)
    X_test_counts = count_vect.transform(test_corpus)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_test_tf = tf_transformer.transform(X_test_counts)
    
    X_names = []
    X_train = []
    y_train = []
    for card, token_text in zip(train, X_train_tf):
        X_names.append(card.name)
        features = np.concatenate((token_text.toarray().flatten(), card.types, [card.power, card.toughness, card.loyalty], card.colors))
        X_train.append(features)
        y_train.append(card.cost)

    X_test = []
    y_test = []
    X_test_names = []
    for card, token_text in zip(test, X_test_tf):
        X_test_names.append(card.name)
        features = np.concatenate((token_text.toarray().flatten(), card.types, [card.power, card.toughness, card.loyalty], card.colors))
        X_test.append(features)
        y_test.append(card.cost)

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)
    
    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=cpus)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_train)
    print y_pred.shape, y_train.shape
    print "naive train loss", np.mean(custom_loss(y_train, y_pred))
    y_pred = rf.predict(X_test)
    print "naive test loss", np.mean(custom_loss(y_test, y_pred))
    result = print_predictions(y_pred, y_test, X_test_names)
    print "saving to output.naive.txt and output.naive.p"
    pickle.dump(result, open('output.naive.p', 'wb'))
def test_transformer_idf_setter():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    copy = TfidfTransformer()
    copy.idf_ = orig.idf_
    assert_array_equal(
        copy.transform(X).toarray(),
        orig.transform(X).toarray())
class NaiveBayesClassifier(object):
    '''
    classdocs
    '''
        
    def __init__(self):
        
        self.classifier = MultinomialNB()
        #self.model = None
        
    def trainClassifier(self, trainingDocs, labels):
        self.trainingDocs = trainingDocs
        self.labels = labels
        
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=True,sublinear_tf=True).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        
        self.ch2 = SelectKBest(chi2)
        X_train = self.ch2.fit_transform(X_train_tf, self.labels)
        
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train, self.labels)
        
    def classify(self, docs_new):
        X_new_counts = self.count_vect.transform(docs_new)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        #predicted = self.model.predict(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        #for doc, category in zip(docs_new, self.predicted):
        #    print '%r => %s' % (doc,category)
        return self.predicted
    
    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
        #predicted_prob_all = self.classifier.predict_proba(X_test)
        #predicted_prob = [max(pr) for pr in predicted_prob_all]
        #return predicted_prob
    
    def score(self,docs_test,labels):
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        
        X_test = self.ch2.transform(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
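# A minimal usage sketch with toy documents (hypothetical strings; assumes the
# scikit-learn and numpy imports this class relies on are already in scope, and
# note SelectKBest's default k=10 needs at least 10 distinct non-stopword terms):
docs = ["cheap pills buy winner prize money limited offer click link",
        "meeting agenda project schedule report review team deadline notes budget"] * 5
labels = [1, 0] * 5

nb = NaiveBayesClassifier()
nb.trainClassifier(docs, labels)
print(nb.classify(["buy cheap pills"]))   # likely [1]
print(nb.score(docs, labels))             # training-set accuracy, likely 1.0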
def tfidf_preprocessor(*args):
  x_train = args[0]
  x_test = args[1]
  x_train = [x.doc_2_vec for x in x_train]
  x_test = [x.doc_2_vec for x in x_test]
  tfidf_model = TfidfTransformer().fit(x_train)
  x_train_tfidf = tfidf_model.transform(x_train)
  x_test_tfidf = tfidf_model.transform(x_test)
  return x_train_tfidf, x_test_tfidf
def TextTransform(X, Xtest = None):
    Write("Process Data with TFIDF...\n")
    tfidf = TfidfTransformer()
    if Xtest is None:
        X = tfidf.fit_transform(X).toarray()
        return X
    else:
        tfidf.fit(X)
        return tfidf.transform(X).toarray(), tfidf.transform(Xtest).toarray()
class feature1:
	def __init__(self):
		self.count_vect = CountVectorizer(input='content',ngram_range=(2,3), min_df=0.2, max_df=1.0)
	def preprocess_X(self, X):
		X = [ans_to_tag[ans] for ans in X]
		X_train_counts = self.count_vect.fit_transform(X)
		self.tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
		X_train_tf = self.tf_transformer.transform(X_train_counts)
		return X_train_tf
	def preprocess_Y(self, Y):
		X_new_counts = self.count_vect.transform(Y)
		return self.tf_transformer.transform(X_new_counts)
class OneClassClassifier(object):
    '''
    classdocs
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.classifier = svm.OneClassSVM(kernel="rbf", gamma='auto')  # (nu=0.1, kernel="rbf", gamma=0.1)
        
    def trainClassifier(self, trainingDocs,labels):
        #self.trainingDocs = trainingDocs
        #self.labels = labels
        
        self.count_vect = CountVectorizer(stop_words='english')
        #X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        X_train_counts = self.count_vect.fit_transform(trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        #self.tf_transformer = TfidfTransformer().fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        
        self.ch2 = SelectKBest(chi2,k=100)
        X_train = self.ch2.fit_transform(X_train_tf, labels)
        
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train)
    
    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
    
    def score(self,docs_test,labels):
        '''
        Here labels are 1 and -1
        '''
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        
        X_test = self.ch2.transform(X_new_tfidf)
        #X_test = X_new_tfidf
        self.predicted = self.classifier.predict(X_test)
        print self.predicted
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
def svm_bench():
    data_file = "./data/dataset.pkl"
    train_set, valid_set, test_set, word2id, pop2id, type2id = dataset.load_data(data_file)

    train_set_x, train_set_y = train_set
    train_set_pop_y, train_set_type_y, train_set_loc_y = train_set_y

    valid_set_x, valid_set_y = valid_set
    valid_set_pop_y, valid_set_type_y, valid_set_loc_y = valid_set_y
    
    test_set_x, test_set_y = test_set
    test_set_pop_y, test_set_type_y, test_set_loc_y = test_set_y
    
    id2word = {v:k for k,v in word2id.items()}
    word_train_set_x = [sen_dig2word(doc, id2word) for doc in train_set_x]
    word_valid_set_x = [sen_dig2word(doc, id2word) for doc in valid_set_x]
    word_test_set_x = [sen_dig2word(doc, id2word) for doc in test_set_x]
    
    # construct the word count matrix
    count_vect = CountVectorizer()
    x_train_count = count_vect.fit_transform(word_train_set_x)
    x_valid_count = count_vect.transform(word_valid_set_x)
    x_test_count = count_vect.transform(word_test_set_x)

    tfidf_transformer = TfidfTransformer()
    x_train_tfidf = tfidf_transformer.fit_transform(x_train_count)
    x_valid_tfidf = tfidf_transformer.transform(x_valid_count)
    x_test_tfidf = tfidf_transformer.transform(x_test_count)

    # train the pop model
    pop_clf = svm.LinearSVC().fit(x_train_tfidf, train_set_pop_y)
    pop_pred = pop_clf.predict(x_valid_tfidf)
    pop_pred_test = pop_clf.predict(x_test_tfidf)

    # compute the performance
    pop_errors = np.mean(np.not_equal(pop_pred, valid_set_pop_y))
    pop_errors_test = np.mean(np.not_equal(pop_pred_test, test_set_pop_y))

    # train the event type model
    type_clf = svm.LinearSVC().fit(x_train_tfidf, train_set_type_y)
    type_pred = type_clf.predict(x_valid_tfidf)
    type_pred_test = type_clf.predict(x_test_tfidf)

    # compute the performance
    type_errors = np.mean(np.not_equal(type_pred, valid_set_type_y))
    type_errors_test = np.mean(np.not_equal(type_pred_test, test_set_type_y))

    print "SVM Valid--> Type error: %0.2f, Population error: %0.2f" % (type_errors, pop_errors)
    print "SVM Test--> Type error: %0.2f, Population error: %0.2f" % (type_errors_test, pop_errors_test)
def cross_val_score(clf, data, target, k):
	shuffle_arr = []
	size = len(data)
	for i in range(size):
		shuffle_arr.append(i)
	scores = []
	for i in range(0, k):
		#generate shuffled train and test dataset
		data_train_raw = []
		data_test_raw = []
		target_train = []
		target_test = []
		# separate shuffled train and test dataset
		random.shuffle(shuffle_arr)
		shuffle_train = shuffle_arr[:size - size // k]
		shuffle_test = shuffle_arr[size - size // k:]
		for j in shuffle_train:
			data_train_raw.append(data[j])
			target_train.append(target[j])
		for r in shuffle_test:
			data_test_raw.append(data[r])
			target_test.append(target[r])

		data_train = data_process(data_train_raw)
		data_test = data_process(data_test_raw)

		# transform array of string to counts
		count_vect = CountVectorizer()
		X_train_counts = count_vect.fit_transform(data_train)
		# transform counts to frequencies
		tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
		X_train_tf = tf_transformer.transform(X_train_counts)
		
		# feature selection
		select = SelectPercentile(chi2, percentile = 10)
		X_train_fs = select.fit_transform(X_train_tf, target_train)
							
		# train the model
		clf_train = clf.fit(X_train_fs, target_train)

		# test the model
		X_new_counts = count_vect.transform(data_test)
		X_new_tfidf = tf_transformer.transform(X_new_counts)
		X_new_fs = select.transform(X_new_tfidf)
		test_result = clf_train.predict(X_new_fs)
		scores.append(GetPrecisionRecallF1(test_result, target_test))
		#clf_score =  clf_train.score(X_new_fs, target_test)
		#scores.append(clf_score)
	return scores
def tfidf_step_by_step():
    """ Example of calculating TF-IDF for OSM nodes.
    Document is a list of keys.
    """

    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term-frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
        token_pattern=r'[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    #pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elements stored in Coordinate format,
    # http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29 )
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
def estimation(file='song_text.txt', separator=u'--text--'):
    arr = text_split_line(file, u'--text--')
    dvect = data_vector(arr)
    target = dvect[0]
    text = dvect[1]
    dic = dvect[2]      # for converting target integer to artist name
#    print (target)
#    print (dic)
    count_vect = CountVectorizer()
    word_vect = count_vect.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    vect_tfidf = tfidf_transformer.fit_transform(word_vect)
    machine = svm.SVC(probability=True) # one of the best for text, see tutorial working with text
    machine.fit(vect_tfidf, target)
    print (machine.score(vect_tfidf, target))
    prediction = machine.predict(vect_tfidf)        # accuracy test (tutorial)
    print (u'model predictive accuracy:  {:.1%}'
           .format(np.mean(prediction == target)))
    new_texts = [text[500], text[2345], text[-2], text[0], text[5893]]
    new_data = count_vect.transform(new_texts)
    new_tfidf = tfidf_transformer.transform(new_data)
    prediction = machine.predict(new_tfidf)
    for i in range(len(new_texts)):
        print (u'{}\t=> {}'.format(new_texts[i].splitlines()[:2],
                                  dic[prediction[i]]))
    return
def race_tfidf(data, can_be_noun_arg, stop_words):
    print 
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms,_ = zip(*sorted_voc)
    for i,k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i,:])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
class VectorModel(object):
    
    def __init__(self , list_of_comments=None):
        self.__list_of_comments = list_of_comments
        self.__vectorizer = []
        self.__corpus_simple_vector = []
        self.__transformer = []
        self.__corpus_tf_idf = []
        #self.prepare_models()
    
    def prepare_models(self):
        self.__vectorizer = CountVectorizer()
        vector = self.__vectorizer.fit_transform(self.__list_of_comments)
        self.__corpus_simple_vector = vector.toarray()
        self.__transformer = TfidfTransformer()
        tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector)
        self.__corpus_tf_idf = tfidf.toarray()
        return [self.__vectorizer , self.__corpus_simple_vector , self.__transformer , self.__corpus_tf_idf]
    
    def set_models(self , vectorizer , transformer):    
        self.__vectorizer = vectorizer
        self.__transformer = transformer
        
    
    def get_comment_frequency_vector(self , comments):
        vec_comments = []
        for i in comments:
            vec_comments.append(i)
        vectores = self.__vectorizer.transform(vec_comments).toarray()
        return vectores
    
    def get_comment_tf_idf_vector(self , comments):
        vector = self.get_comment_frequency_vector(comments)
        result = self.__transformer.transform(vector).toarray()
        return result
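# A minimal usage sketch (hypothetical comments; assumes CountVectorizer and
# TfidfTransformer are imported as this class requires). prepare_models() must
# be called explicitly, since the constructor no longer does it:
model = VectorModel(["great phone, love the battery",
                     "terrible screen, would not buy again",
                     "battery life is great"])
model.prepare_models()
print(model.get_comment_tf_idf_vector(["great battery"]).shape)   # (1, vocabulary size)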
class UnitClassifier(Trainer):
    def __init__(self, x, y, train_ratio):
        super(UnitClassifier, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'misclass': np.mean(y_pred != self._y_test),
            'report': classification_report(self._y_test, y_pred,
                                            target_names=self._model.classes_)
        }
class CaloriesRegressor(Trainer):
    def __init__(self, x, y, train_ratio):
        super(CaloriesRegressor, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'median_absolute_error':
            median_absolute_error(self._y_test, y_pred),
            'mean_squared_error': mean_squared_error(self._y_test, y_pred),
            'explained_variance_score':
            explained_variance_score(self._y_test, y_pred),
        }
def check_webshell(clf,dir):
    all=0
    all_php=0
    webshell=0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)


    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            filepath = os.path.join(path, filename)
            t = load_file(filepath)
            t_list=[]
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all+=1
            if filename.endswith('.php'):
                all_php+=1
            if y_pred[0] == 1:
                print "%s is a webshell" % filepath
                webshell+=1

    print "Scanned %d files (%d php files), %d webshells found" % (all, all_php, webshell)
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features:
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data as a list. Only a random sample of 30,000 reviews is used, for efficiency.
    test_data : List[str]
        Test data as a list.

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    
    vocab = count_vect.get_feature_names()
    
    return (train_tfidf, test_tfidf, vocab)
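# A minimal usage sketch (hypothetical toy reviews; assumes the imports the
# function relies on -- random, nltk stopwords, CountVectorizer, TfidfTransformer --
# are in scope). Because the function samples 30,000 training documents, the toy
# training list is padded to exactly that size:
train_reviews = ["great product fast shipping", "terrible quality broke quickly"] * 15000
test_reviews = ["great quality product"]
X_train_tfidf, X_test_tfidf, vocab = extract_text_features(train_reviews, test_reviews)
print(X_train_tfidf.shape, X_test_tfidf.shape, len(vocab))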
def load_dataset(prefix, sufix, dic_fn, vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]
    
    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]
    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y]) 
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y]) 
    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
def test_classifiers():
    print("running bayes classifier..")
    # train_bayesian_classifier_from_scratch()

    dataset = get_thing_from_file("training_dataset.txt")
    print(dataset.target_names)
    bayes = get_thing_from_file("bayes.txt")
    bayes_model = bayes.fit(dataset.data, dataset.target)
    bayes_model = get_thing_from_file("bayes_model.txt")

    results = []
    count = 0
    url_arr = []

    bayes_predicted = bayes_model.predict(dataset.data)

    # for url in get_test_articles():
    #     url_arr.append(url)

    article_arr = get_article_array(url_arr)

    docs_new = ['God is love', 'OpenGL on the GPU is fast']

    count_vect = CountVectorizer().fit(dataset.data)
    tfidf_trans = TfidfTransformer().fit(count_vect.transform(dataset.data))
    x_new_counts = count_vect.transform(docs_new)
    x_new_horse = tfidf_trans.transform(x_new_counts)

    predicted = bayes_model.predict(x_new_horse)

    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, dataset.target_names[category]))
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    print "\nBegin: compute the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    AllSet = pd.merge( AllSet , product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix=description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result=[]#compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegin: compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_title = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_title.fit(product_title)#learn the vocabulary
    search_title_fq_matrix = search_vect_title.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_title_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """

    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix
    
    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}


    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
def tfidf_score(train_set, test_set):

    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    #Remove all the None Types from the input datasets
    train_set = list(filter(None, train_set))
    test_set = list(filter(None, test_set))
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
#
# See below for a simple example.
#
# **Example:**
#
# Consider a document containing 100 words wherein the word cat appears 3 times.
#
# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
# have 10 million documents and the word cat appears in one thousand of these.
# Then, the inverse document frequency (i.e., idf) is calculated as
# log(10,000,000 / 1,000) = 4. Thus, the Tf-idf weight is the product of these
# quantities: 0.03 * 4 = 0.12.
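#
# As a quick sanity check, that arithmetic can be reproduced directly (a tiny
# sketch using the base-10 log the example implies; the numbers are the toy
# figures above, not real data):

import math

tf = 3 / 100                          # "cat" appears 3 times in a 100-word document
idf = math.log10(10000000 / 1000)     # 10 million documents, "cat" in 1,000 of them -> 4.0
print(tf * idf)                       # 0.12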
# ____
#
# Let's go ahead and see how we can do this in SciKit Learn:

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

# Let's check the IDF (inverse document frequency) of the word `"u"` and of the word `"university"`:

print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

# To transform the entire bag-of-words corpus into TF-IDF corpus at once:

messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

# There are many ways the data can be preprocessed and vectorized. These steps involve feature engineering and building a "pipeline". I encourage you to check out SciKit Learn's documentation on dealing with text data as well as the expansive collection of available papers and books on the general topic of NLP.

# ## Training a model
def text_process(mess):
    # assumes `import string` and `from nltk.corpus import stopwords` at module level
    nopunc = "".join(ch for ch in mess if ch not in string.punctuation)
    return [
        w for w in nopunc.split()
        if w.lower() not in stopwords.words("english")
    ]


bow_transformer = CountVectorizer(analyzer=text_process).fit(
    messages["message"])
print(len(bow_transformer.vocabulary_))

messages_bow = bow_transformer.transform(messages["message"])

print("shape of the Sparse Matrix:", messages_bow.shape)

tfidf_trans = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_trans.transform(messages_bow)

spam_detect = MultinomialNB().fit(messages_tfidf, messages["label"])

print(spam_detect.predict(messages_tfidf[4])[0])

msg_train, msg_test, la_train, la_test = train_test_split(messages["message"],
                                                          messages["label"],
                                                          test_size=.3)

pipes = Pipeline([("bow", CountVectorizer(analyzer=text_process)),
                  ("tfidf", TfidfTransformer()),
                  ("classifier", MultinomialNB())])

pipes.fit(msg_train, la_train)
    return (data, target)


X, y = get_data(DATA_DIR)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=42)

count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)
#clf = SGDClassifier(tol=None, n_jobs=-1).fit(X_train_tfidf, y_train)

X_test_counts = count_vec.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

text_clf = Pipeline([('count_vec', CountVectorizer()),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('clf', SGDClassifier(tol=None,
                                           n_jobs=-1))]).fit(X_train, y_train)

y_pred_nb = clf.predict(X_test_tfidf)  # predictions from the plain MultinomialNB model (unused below)
y_pred = text_clf.predict(X_test)      # predictions from the SGD pipeline, used for the reports
print(metrics.classification_report(y_test, y_pred, target_names=target_names))
print(metrics.jaccard_score(y_test, y_pred))
def run_classifier(X_train, s_train, y_train, X_test, s_test, y_test):
    s_train = np.array(s_train)  # samples x features
    s_test = np.array(s_test)

    num_labels = 15
    batch_size = 100

    stemmer = sb.SnowballStemmer('english')

    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += [
        "'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure',
        'might', 'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha',
        'themselv', 'veri', 'whi', 'wo', 'would', 'yourselv'
    ]  #complained about not having these as stop words
    pubs = [
        'buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox',
        'guardian', 'review', 'theatlant'
    ]
    punct = [
    ]  #[':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now

    swlist += pubs
    swlist += punct
    if sys.argv[4].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None

    if sys.argv[5].lower() != 'true':
        swlist = []

    #what features are we using?
    if sys.argv[7].lower() == 'word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)

    elif sys.argv[7].lower() == 'topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)

    elif sys.argv[7].lower() == 'style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)

    elif sys.argv[7].lower() == 'all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))

        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))

        X_train = csr_matrix(
            sparse.hstack(
                [X_train_tf,
                 csr_matrix(X_train_lda),
                 csr_matrix(s_train)]))
        X_test = csr_matrix(
            sparse.hstack(
                [X_test_tf,
                 csr_matrix(X_test_lda),
                 csr_matrix(s_test)]))

        print(type(X_train))

        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)

    else:
        sys.exit('unknown features')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    # np.save('X_train.npy', X_train)
    # np.save('X_test.npy', X_test)
    # np.save('y_train.npy', y_train)
    # np.save('y_test.npy', y_test)

    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)

    # load everything back
    # X_train = sparse.load_npz("X_train.npz")

    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    # model.model.save(sys.argv[6] + '.h5')

    # X_train = np.load('X_train.npy')
    # X_test = np.load('X_test.npy')
    # y_train = np.load('y_train.npy')
    # y_test = np.load('y_test.npy')

    # model = keras.models.load_model(sys.argv[6] + '.h5')
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

    print('Test accuracy:', score[1])

    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    predicted = np.argmax(y_pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1),
                                                  predicted)
    print(p, r, fs, s)

#Sparsity: percentage of non-zero entries out of all entries in the bag-of-words matrix



sparsity = (100.0 * message_bow.nnz / (message_bow.shape[0] * message_bow.shape[1]))
print('sparsity: {}'.format((sparsity)))




from sklearn.feature_extraction.text import TfidfTransformer #calculating the Term frequency and Inverse document frequency
        
tfidf_transformer = TfidfTransformer().fit(message_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

messages_tfidf=tfidf_transformer.transform(message_bow) #calculating TF-IDF for bag of words

#Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])


#Detection filter using Naive Bayes for Label 2
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', messages.label[2])

#Model Evaluation of all
all_predictions = spam_detect_model.predict(messages_tfidf)
    # print(tf_matrix_2.shape)

    # print("\nCalculating inverse document frequency (IDF) matrices")
    # Each vector's component is now the idf for each term
    tfidfTran = TfidfTransformer(norm="l2")
    tfidfTran.fit(tf_matrix)
    # print(tfidfTran.idf_)

    # Manually verify that the IDF is correct
    # print("The idf for terms that appear in one document: " + str(idf(2,1)))
    # print("The idf for terms that appear in two documents: " + str(idf(2,2)))
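    # (A hedged sketch of that idf helper, assuming idf(n_docs, df) mirrors
    # TfidfTransformer's default smooth_idf=True formula; it is not part of
    # the original snippet.)
    # import math
    # def idf(n_docs, df):
    #     return math.log((1 + n_docs) / (1 + df)) + 1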

    # print("\nCreating the TF-IDF matrices")
    # Transform method here multiplies the tf matrix by the diagonal idf matrix
    # The method then divides the tf-idf matrix by the Euclidean norm
    tfidf_matrix = tfidfTran.transform(tf_matrix)
    # print(tfidf_matrix.toarray())

    # print("\nCreating the cosine similarity matrices")
    # Multiply matrix by transpose to get final result
    cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()
    print("#: {}, score: {}".format(i+1, cos_similarity_matrix[0][1])) 

    group_set[i]['similarity'] = cos_similarity_matrix[0][1]

    # Calculating average and appending it to group_set
    # print("\nCalculating averages")
    # num_reviews = len(current_set)
    # for i in range(num_reviews):
    #     # i is the index of the focal review
    #     # cos_similarity_matrix[i] contains the array of similarity scores for the focal review 
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou  ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()
    def build(self, texts: list, topics: list):
        token_pattern = r"(?u)\b\w\w+\b"
        vectorizer = CountVectorizer(
            token_pattern=token_pattern,
            stop_words=get_stop_words('russian')
        )
        X = vectorizer.fit_transform(texts)
        transformer = TfidfTransformer()
        X = transformer.fit_transform(X)

        if self._vectorizer_path is not None:
            with open(self._vectorizer_path, 'w') as f:
                for key, value in vectorizer.vocabulary_.items():
                    f.write("%s " % key)
                    f.write("%s \n" % value)

        classifier = SGDClassifier(
            loss="log", class_weight='balanced',
            penalty='l1', alpha=0.0000009, n_jobs=-1
        )

        if self._test_quality_iters is not None:
            for i in range(self._test_quality_iters):
                X_train, X_test, y_train, y_test = train_test_split(X, topics, test_size=0.33)
                classifier.fit(X_train, y_train)
                predicted = classifier.predict(X_test)
                print('Accuracy', np.mean(predicted == y_test))

        if self._X_test is not None:
            X_test = self._X_test
            y_test = self._y_test

            classifier.fit(X, topics)
            sorted_topics = np.unique(topics)
            X_transformed = transformer.transform(vectorizer.transform(X_test))
            predicted = classifier.predict(X_transformed)
            print('Accuracy on real tests:', np.mean(predicted == y_test))
            # vocabulary = vectorizer.get_feature_names() : for human friendly features
            if self._tests_path is not None:
                with open(self._tests_path, 'w') as t:
                    t.write("%s %s\n" % (self._tests_num, X.shape[1]))

                    for index in range(len(X_test)):
                        doc = X_transformed[index]
                        probs = classifier.predict_proba(doc)
                        for item in probs[0]:
                            t.write("%s " % item)
                        t.write("\n")

                        orig_doc = X_test[index]
                        t.write("%s \n" % orig_doc)
                        for item in doc.toarray()[0]:
                            t.write("%s " % item)
                        t.write("\n")

                        print(orig_doc)
                        pred_topics = {}
                        for i in range(len(probs[0])):
                            probability = probs[0][i]
                            topic = sorted_topics[i]
                            pred_topics[topic] = probability

                        print(sorted(pred_topics.items(), key=lambda kv: kv[1], reverse=True))
                        print("______")

        if self._weights_path is not None:
            with open(self._weights_path, 'w') as f:
                f.write("%s " % classifier.coef_.shape[0])  # amount of classes
                f.write("%s \n" % classifier.coef_.shape[1])  # amount of features
                for line in classifier.classes_:
                    f.write("%s \n" % line)
                for line in classifier.coef_:
                    for index, item in enumerate(line):
                        if item != 0.0:
                            f.write("%s %s " % (index, item))
                    f.write("\n")
                for item in classifier.intercept_:
                    f.write("%s " % item)

        return classifier