Python Classifierの例

プログラミング言語: Python

名前空間/パッケージ名: classes.Classifier

クラス/型: Classifier

hotexamples.comのコード掲載数: 4

Python Classifier - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのclasses.Classifier.Classifierの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

Classifier(2)

createVectSpaceCategory(1)

createVectSpacePost(1)

createVectSpaceSubcategory(1)

getModel(1)

get_classifier_list(1)

get_stacking(1)

trainModel(1)

コード例 #1

ファイルを表示

ファイル: train_classifier.py プロジェクト: janes/experiment_ic_productsimilarity

def main(sc):
    """Train category classifiers on product text and print their accuracy.

    Pipeline:
      1. Load the products of a fixed list of categories and turn field 1 of
         each record into a list of stemmed noun tokens.
      2. Weight the tokens with TF-IDF (helpers ``idfs`` / ``tfidf``).
      3. Store the vocabulary and the category lists through
         ``insertTokensAndCategories``.
      4. Train and score, each on its own 8:2 random split (seed 0):
         a Naive Bayes category model, a Naive Bayes subcategory model and a
         Decision Tree category model.

    Records are 4-tuples; index 2 is used as the category and index 3 as the
    subcategory. Index 0 is presumably a product id and index 1 the product
    text -- TODO confirm against ``findProductsByCategory``.

    Args:
        sc: Spark context used to parallelize the products and to broadcast
            the IDF table.
    """
    start = timer()

    # "Musical Instruments" is currently commented out of the category list.
    categs = ["Computers & Tablets", "Video Games", "TV & Home Theater"]

    # A set gives O(1) stop-word lookups inside the per-token filter below.
    stpwrds = set(stopwords.words('english'))
    # Translation table mapping every code point whose Unicode category starts
    # with S (symbol), P (punctuation) or N (number) to None, so translate()
    # deletes those characters.
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('S', 'P', 'N')))

    productRDD = sc.parallelize(findProductsByCategory(categs))
    corpusRDD = (productRDD
                 # strip symbols/punctuation/digits, lowercase, tokenize
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 # drop stop words, stem what is left
                 .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                 # keep only tokens tagged NN or NNP
                 .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] in ('NN', 'NNP')], s[2], s[3]))
                 .cache())

    idfsRDD = idfs(corpusRDD)
    # The IDF table is collected to the driver and broadcast so every task can
    # look weights up locally.
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3]))

    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()

    insertTokensAndCategories(tokens, category, categoryAndSubcategory)

    # --- Naive Bayes on categories -------------------------------------
    classifier = Classifier(sc, 'NaiveBayes')
    trainingVectSpaceCategoryRDD, testVectSpaceCategoryRDD = (
        classifier.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    # The second argument is a model path (presumably where the trained model
    # is persisted -- verify in Classifier.trainModel).
    modelNaiveBayesCategory = classifier.trainModel(trainingVectSpaceCategoryRDD, '/dados/models/naivebayes/category_new')

    # Labels and predictions are indices into `category`; map both back to the
    # category value before comparing.
    predictionAndLabelCategoryRDD = testVectSpaceCategoryRDD.map(
        lambda p: (category[int(modelNaiveBayesCategory.predict(p.features))], category[int(p.label)]))
    # Compare the whole predicted value with the whole expected value.
    # Indexing [0] into each of them would compare only the first element
    # (for strings: the first character).
    accuracyCategory = (float(predictionAndLabelCategoryRDD.filter(lambda pl: pl[0] == pl[1]).count())
                        / float(predictionAndLabelCategoryRDD.count()))
    print('the accuracy of the Category Naive Bayes model is %f' % accuracyCategory)

    # --- Naive Bayes on (category, subcategory) pairs --------------------
    # training in this second way just for test
    trainingVectSpaceSubcategory, testVectSpaceSubcategory = (
        classifier.createVectSpaceSubcategory(tfidfRDD, categoryAndSubcategory, tokens)
        .randomSplit([8, 2], seed=0))
    modelNaiveBayesSubcategory = classifier.trainModel(trainingVectSpaceSubcategory, '/dados/models/naivebayes/subcategory_new')

    predictionAndLabelSubcategory = testVectSpaceSubcategory.map(
        lambda p: (categoryAndSubcategory[int(modelNaiveBayesSubcategory.predict(p.features))],
                   categoryAndSubcategory[int(p.label)]))
    # NOTE(review): both sides are (category, subcategory) pairs and only the
    # category component (index 0) is compared, so a wrong subcategory inside
    # the right category counts as a hit. Confirm this is the intended metric.
    accuracySubcategory = (float(predictionAndLabelSubcategory.filter(lambda pl: pl[0][0] == pl[1][0]).count())
                           / float(predictionAndLabelSubcategory.count()))
    print('the accuracy of the Subcategory Naive Bayes model is %f' % accuracySubcategory)

    # --- Decision Tree on categories -------------------------------------
    classifierDT = Classifier(sc, 'DecisionTree')
    trainingVectSpaceCategory, testVectSpaceCategory = (
        classifierDT.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    modelDecisionTreeCategory = classifierDT.trainModel(trainingVectSpaceCategory, '/dados/models/dt/category_new')

    # Here the model predicts on a whole RDD of feature vectors; the result is
    # zipped back onto the labels to form (label, prediction) pairs.
    predictions = modelDecisionTreeCategory.predict(testVectSpaceCategory.map(lambda x: x.features))
    predictionAndLabelCategory = testVectSpaceCategory.map(lambda lp: lp.label).zip(predictions)
    accuracyDecisionTree = (float(predictionAndLabelCategory.filter(lambda lp: lp[0] == lp[1]).count())
                            / float(predictionAndLabelCategory.count()))
    print('the accuracy of the Decision Tree model is %f' % accuracyDecisionTree)

    elap = timer() - start
    print('it tooks %d seconds' % elap)

コード例 #2

ファイルを表示

ファイル: make_prediction.py プロジェクト: felipecontra3/experiment_ic_productsimilarity

def main(sc):
    """Suggest products for a fixed set of sample posts.

    Steps:
      1. Build one corpus from the stored products plus six hard-coded posts,
         keeping only tokens that belong to the stored vocabulary.
      2. Weight the corpus with TF-IDF (helpers ``idfs`` / ``tfidf``).
      3. Predict a (category, subcategory) index for every post with the
         Naive Bayes subcategory model loaded from disk.
      4. For every predicted class, compute the cosine similarity between the
         posts of that class and the products of the predicted category, and
         pass the matches above ``threshold`` to ``insertSuggestions``.

    Records are 4-tuples. For posts they are (id, text, u'Post', source); for
    products index 2 is compared against category names below.

    Args:
        sc: Spark context used to parallelize the data and broadcast lookups.
    """
    # Start the clock here: the elapsed-time report at the end reads `start`,
    # which was otherwise never assigned inside this function.
    start = timer()

    iduser = 1
    posts = [
        (u'post1', u'I love computers! i would like to buy an asus notebook.', u'Post', u'Twitter'),
        (u'post2', u'My tablet is not working anymore, i need to buy a new one', u'Post', u'Facebook'),
        (u'post3', u'I love to watch TV on saturday nights! ', u'Post', u'Twitter'),
        (u'post4', u'i love to watch netflix on my smart tv', u'Post', u'Twitter'),
        (u'post5', u'The #Kindle2 seems the best eReader, but will it work in the UK and where can I get one?', u'Post', u'Facebook'),
        (u'post6', u'I still love my Kindle2 but reading The New York Times on it does not feel natural. I miss the Bloomingdale ads.', u'Post', u'Facebook'),
    ]

    postsRDD = sc.parallelize(posts)
    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    # A set gives O(1) stop-word lookups inside the per-token filter below.
    stpwrds = set(stopwords.words('english'))
    # Translation table mapping every code point whose Unicode category starts
    # with P (punctuation) or N (number) to None, so translate() deletes it.
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('P', 'N')))

    productRDD = sc.parallelize(findProductsByCategory(category))

    productAndPostRDD = productRDD.union(postsRDD)

    corpusRDD = (productAndPostRDD
                 # strip punctuation/digits, lowercase, tokenize
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 # drop stop words, stem what is left
                 .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                 # keep only tokens from the stored vocabulary
                 .map(lambda s: (s[0], [x for x in s[1] if x in tokens], s[2], s[3]))
                 # products need at least 20 tokens; posts are always kept
                 .filter(lambda x: len(x[1]) >= 20 or x[2] == u'Post')
                 .cache())

    idfsRDD = idfs(corpusRDD)
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3])).cache()

    # Post-side lookups (weights and norms) are the same for every predicted
    # class, so they are collected and broadcast once, outside the loop.
    tfidfPostsRDD = tfidfRDD.filter(lambda x: x[2] == 'Post').cache()
    tfidfPostsBroadcast = sc.broadcast(tfidfPostsRDD.map(lambda x: (x[0], x[1])).collectAsMap())
    corpusPostsNormsRDD = tfidfPostsRDD.map(lambda x: (x[0], norm(x[1]))).cache()
    corpusPostsNormsBroadcast = sc.broadcast(corpusPostsNormsRDD.collectAsMap())

    classifier = Classifier(sc, 'NaiveBayes')
    #classifierDT = Classifier(sc, 'DecisionTree')
    #modelNaiveBayesCategory = classifier.getModel('/dados/models/naivebayes/category_new')
    modelNaiveBayesSubcategory = classifier.getModel('/dados/models/naivebayes/subcategory_new')
    #modelDecisionTree = classifierDT.getModel('/dados/models/dt/category_new')

    postsSpaceVectorRDD = classifier.createVectSpacePost(tfidfPostsRDD, tokens)
    #predictionCategoryNaiveBayesCategoryRDD = postsSpaceVectorRDD.map(lambda p: modelNaiveBayesCategory.predict(p))
    #predictionCategoryDecisionTreeRDD = modelDecisionTree.predict(postsSpaceVectorRDD.map(lambda x: x))
    # Group post ids by predicted class index: [(class_index, [post_id, ...]), ...]
    predictions = (postsSpaceVectorRDD
                   .map(lambda p: (modelNaiveBayesSubcategory.predict(p[1]), p[0]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())

    for prediction in predictions:

        # Only the category half of the predicted (category, subcategory)
        # pair is used to select candidate products.
        category_to_use = categoryAndSubcategory[int(prediction[0])][0]

        tfidfProductsCategoryRDD = tfidfRDD.filter(lambda x: x[2] == category_to_use).cache()
        tfidfProductsCategoryBroadcast = sc.broadcast(tfidfProductsCategoryRDD.map(lambda x: (x[0], x[1])).collectAsMap())

        # Inverted pairs (token, record id); joining them on the token yields
        # every (product, post) pair that shares at least one token.
        corpusInvPairsProductsRDD = tfidfProductsCategoryRDD.flatMap(lambda r: ([(x, r[0]) for x in r[1]])).cache()
        corpusInvPairsPostsRDD = (tfidfPostsRDD
                                  .flatMap(lambda r: ([(x, r[0]) for x in r[1]]))
                                  .filter(lambda x: x[1] in prediction[1])
                                  .cache())
        commonTokens = (corpusInvPairsProductsRDD.join(corpusInvPairsPostsRDD)
                                                 .map(lambda x: (x[1], x[0]))
                                                 .groupByKey()
                                                 .cache())

        corpusProductsNormsRDD = tfidfProductsCategoryRDD.map(lambda x: (x[0], norm(x[1]))).cache()
        corpusProductsNormsBroadcast = sc.broadcast(corpusProductsNormsRDD.collectAsMap())

        similaritiesRDD = (commonTokens
                           .map(lambda x: cosineSimilarity(x, tfidfProductsCategoryBroadcast.value, tfidfPostsBroadcast.value, corpusProductsNormsBroadcast.value, corpusPostsNormsBroadcast.value))
                           .cache())

        # NOTE(review): `threshold` is not defined in this function; presumably
        # a module-level constant -- verify it exists.
        suggestions = (similaritiesRDD
                       # re-key by post id: (post_id, (product_id, similarity))
                       .map(lambda x: (x[0][1], (x[0][0], x[1])))
                       .filter(lambda x: x[1][1] > threshold)
                       .groupByKey()
                       .mapValues(list)
                       .join(postsRDD)
                       .join(postsRDD.map(lambda x: (x[0], x[3])))
                       .collect())

        if suggestions:
            insertSuggestions(suggestions, iduser, productRDD)

    elap = timer() - start
    print('it tooks %d seconds' % elap)

コード例 #3

ファイルを表示

ファイル: Santander-Train Ensemble.py プロジェクト: joseferrercba/FundacionsadoskySantander

import mlflow
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from classes.Vectorizer import Vectorizer
from classes.Classifier import Classifier
from classes.Resample import Resample
from classes.ModelBuilder import ModelBuilder
from classes.Constans import *

# In[ ]:
# Project helper objects (from the local `classes` package) used by this script.
classifier = Classifier()
vectorizer = Vectorizer()
resample = Resample()
builder = ModelBuilder()
# Starts empty in this script.
classifier_list = []

# ### Get Info from CSV

# In[ ]:
# Training data is a pipe-delimited CSV; rows are shuffled right after loading.
df_train = shuffle(pd.read_csv('data/train_preprocessed.csv', sep='|'))
# Only the 'id' and 'Pregunta' columns are loaded from the test file.
df_test = shuffle(
    pd.read_csv('data/test_santander.csv', usecols=['id', 'Pregunta']))
# Print how many training rows each 'Intencion_cat_label' value has.
print(df_train['Intencion_cat_label'].value_counts())
# Resample the training set: one class has a single sample, and a stratified
# split needs at least 2 samples per class.
# NOTE(review): the meaning of the arguments 5 and 100 is defined by
# Resample.apply_resample -- confirm there.
df_train = resample.apply_resample(df_train, 'Pregunta', 5, 100)

コード例 #4

ファイルを表示

ファイル: Santander-Train Model.py プロジェクト: joseferrercba/FundacionsadoskySantander

# In[ ]:
import sys
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from classes.Vectorizer import Vectorizer
from classes.Classifier import Classifier
from classes.Resample import Resample
from classes.ModelBuilder import ModelBuilder
from classes.Constans import *

# In[ ]:
# Project helper objects (from the local `classes` package) used by this script.
classifier = Classifier()
vectorizer = Vectorizer()
resample = Resample()
builder = ModelBuilder()

#--------------------------------------------------#
###                  CLASSIFIERS                 ###
#--------------------------------------------------#
# The candidate classifiers come from the project's Classifier helper.
classifier_list = classifier.get_classifier_list()


def build_model(X, y, model, df_test):
    model_name = model.__class__.__name__
    X_train, X_test, y_train, y_test = builder.get_train_test_split(X, y)
    RESAMPLE_FILE = 'data/apply_resample_after_{}.png'.format(model_name)
    if APPLY_RESAMPLE == True: