コード例 #1
0
def main(sc):
    """Train three product classifiers on TF-IDF features and print their accuracy.

    Steps, in order:

    1. Load the products of a fixed list of categories through
       ``findProductsByCategory`` and parallelize them.
    2. Turn each product text (field 1 of the record) into a list of tokens:
       strip symbol/punctuation/number characters, lower-case, tokenize, drop
       English stopwords, stem, and keep only tokens tagged ``NN`` or ``NNP``.
    3. Weight the tokens with ``idfs``/``tfidf`` and hand the distinct tokens,
       categories and (category, subcategory) pairs to
       ``insertTokensAndCategories``.
    4. Train and evaluate, each on its own 8:2 random split (seed 0):
       Naive Bayes on category, Naive Bayes on (category, subcategory) and a
       Decision Tree on category. ``Classifier.trainModel`` receives a model
       path for each of them.
    5. Print each accuracy and the elapsed time.

    Records are 4-tuples; field 2 is the category and field 3 the subcategory.
    Field 0 is carried through untouched (presumably a product id).

    Args:
        sc: the SparkContext used to create every RDD and broadcast.
    """
    start = timer()

    # "Musical Instruments" is currently left out.
    categs = ["Computers & Tablets", "Video Games", "TV & Home Theater"]

    # A set gives constant-time stopword lookups on the executors.
    stpwrds = set(stopwords.words('english'))

    # Map every symbol (S*), punctuation (P*) and number (N*) code point to
    # None so that translate() deletes it. The category is looked up once per
    # code point instead of three times.
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('S', 'P', 'N')))

    def stem_without_stopwords(words):
        # One stemmer per record rather than one per token.
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in words if w not in stpwrds]

    def keep_nouns(words):
        # pos_tag yields (word, tag) pairs; keep common and proper nouns only.
        return [word for word, tag in pos_tag(words) if tag in ('NN', 'NNP')]

    def accuracy(pairsRDD, is_hit):
        # Fraction of pairs accepted by is_hit. An empty test split yields NaN
        # instead of raising ZeroDivisionError after the models were trained.
        total = pairsRDD.count()
        if total == 0:
            return float('nan')
        return float(pairsRDD.filter(is_hit).count()) / float(total)

    productRDD = sc.parallelize(findProductsByCategory(categs))
    corpusRDD = (productRDD
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 .map(lambda s: (s[0], stem_without_stopwords(s[1]), s[2], s[3]))
                 .map(lambda s: (s[0], keep_nouns(s[1]), s[2], s[3]))
                 .cache())

    idfsRDD = idfs(corpusRDD)
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3]))

    # The positions in these collected lists are what the numeric labels and
    # predictions are mapped back through below.
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()

    insertTokensAndCategories(tokens, category, categoryAndSubcategory)

    # --- Naive Bayes on category ---
    classifier = Classifier(sc, 'NaiveBayes')
    trainingVectSpaceCategoryRDD, testVectSpaceCategoryRDD = (
        classifier.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    modelNaiveBayesCategory = classifier.trainModel(trainingVectSpaceCategoryRDD, '/dados/models/naivebayes/category_new')

    predictionAndLabelCategoryRDD = testVectSpaceCategoryRDD.map(
        lambda p: (category[int(modelNaiveBayesCategory.predict(p.features))], category[int(p.label)]))
    # Both elements are category names (strings): compare the whole name.
    # Indexing with [0] here would compare only the first character.
    acuraccyCategory = accuracy(predictionAndLabelCategoryRDD, lambda pair: pair[0] == pair[1])
    print('the accuracy of the Category Naive Bayes model is %f' % acuraccyCategory)

    # --- Naive Bayes on (category, subcategory); trained this second way just as a test ---
    trainingVectSpaceSubcategory, testVectSpaceSubcategory = (
        classifier.createVectSpaceSubcategory(tfidfRDD, categoryAndSubcategory, tokens)
        .randomSplit([8, 2], seed=0))
    modelNaiveBayesSubcategory = classifier.trainModel(trainingVectSpaceSubcategory, '/dados/models/naivebayes/subcategory_new')

    predictionAndLabelSubcategory = testVectSpaceSubcategory.map(
        lambda p: (categoryAndSubcategory[int(modelNaiveBayesSubcategory.predict(p.features))],
                   categoryAndSubcategory[int(p.label)]))
    # Both elements are (category, subcategory) tuples, so [0] compares the
    # category component only, exactly as the original code did.
    # NOTE(review): confirm whether the full pair should have to match.
    acuraccySubcategory = accuracy(predictionAndLabelSubcategory, lambda pair: pair[0][0] == pair[1][0])
    print('the accuracy of the Subcategory Naive Bayes model is %f' % acuraccySubcategory)

    # --- Decision Tree on category ---
    classifierDT = Classifier(sc, 'DecisionTree')
    trainingVectSpaceCategory, testVectSpaceCategory = (
        classifierDT.createVectSpaceCategory(tfidfRDD, category, tokens)
        .randomSplit([8, 2], seed=0))
    modelDecisionTreeCategory = classifierDT.trainModel(trainingVectSpaceCategory, '/dados/models/dt/category_new')

    # This model is applied to the whole RDD of feature vectors at once, and
    # the predictions are zipped back onto the labels.
    predictions = modelDecisionTreeCategory.predict(testVectSpaceCategory.map(lambda x: x.features))
    predictionAndLabelCategory = testVectSpaceCategory.map(lambda lp: lp.label).zip(predictions)
    acuraccyDecisionTree = accuracy(predictionAndLabelCategory, lambda pair: pair[0] == pair[1])
    print('the accuracy of the Decision Tree model is %f' % acuraccyDecisionTree)

    elap = timer() - start
    print('it tooks %d seconds' % elap)
コード例 #2
0
def main(sc):
    """Suggest products for a fixed set of sample posts and store the suggestions.

    Steps, in order:

    1. Parallelize six hard-coded sample posts and load the known tokens,
       categories and (category, subcategory) pairs from
       ``getTokensAndCategories``.
    2. Load the products of those categories, union them with the posts and
       preprocess every text: strip punctuation/number characters, lower-case,
       tokenize, drop English stopwords, stem, and keep only known tokens.
       Products left with fewer than 20 tokens are dropped; posts are always
       kept.
    3. Compute TF-IDF weights and broadcast the post weights and post norms.
    4. Load the subcategory Naive Bayes model, predict a label for every post
       vector and group the post ids by predicted label.
    5. For each predicted label, take its category, pair the products of that
       category with the posts that share at least one token, score each pair
       with ``cosineSimilarity`` and keep the pairs scoring above
       ``threshold``. Non-empty results go to ``insertSuggestions``.
    6. Print the elapsed time.

    Records are 4-tuples; field 1 is the text and field 2 is the category for
    products or ``u'Post'`` for posts.

    Args:
        sc: the SparkContext used to create every RDD and broadcast.
    """
    # The original never set `start`, so the elapsed-time computation at the
    # end raised NameError unless a module-level `start` happened to exist.
    start = timer()

    iduser = 1
    posts = [
        (u'post1', u'I love computers! i would like to buy an asus notebook.', u'Post', u'Twitter'),
        (u'post2', u'My tablet is not working anymore, i need to buy a new one', u'Post', u'Facebook'),
        (u'post3', u'I love to watch TV on saturday nights! ', u'Post', u'Twitter'),
        (u'post4', u'i love to watch netflix on my smart tv', u'Post', u'Twitter'),
        (u'post5', u'The #Kindle2 seems the best eReader, but will it work in the UK and where can I get one?', u'Post', u'Facebook'),
        (u'post6', u'I still love my Kindle2 but reading The New York Times on it does not feel natural. I miss the Bloomingdale ads.', u'Post', u'Facebook'),
    ]

    postsRDD = sc.parallelize(posts)
    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    # Sets give constant-time membership tests on the executors. The original
    # `tokens` object is still what gets passed to createVectSpacePost below.
    knownTokens = set(tokens)
    stpwrds = set(stopwords.words('english'))

    # Map every punctuation (P*) and number (N*) code point to None so that
    # translate() deletes it; one category lookup per code point.
    tbl_translate = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith(('P', 'N')))

    def stem_without_stopwords(words):
        # One stemmer per record rather than one per token.
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in words if w not in stpwrds]

    productRDD = sc.parallelize(findProductsByCategory(category))

    productAndPostRDD = productRDD.union(postsRDD)

    corpusRDD = (productAndPostRDD
                 .map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                 .map(lambda s: (s[0], stem_without_stopwords(s[1]), s[2], s[3]))
                 .map(lambda s: (s[0], [x for x in s[1] if x in knownTokens], s[2], s[3]))
                 .filter(lambda x: len(x[1]) >= 20 or x[2] == u'Post')
                 .cache())

    idfsRDD = idfs(corpusRDD)
    idfsRDDBroadcast = sc.broadcast(idfsRDD.collectAsMap())
    tfidfRDD = corpusRDD.map(lambda x: (x[0], tfidf(x[1], idfsRDDBroadcast.value), x[2], x[3])).cache()

    # Post-side weights and norms are the same for every predicted label, so
    # they are computed and broadcast once, outside the loop.
    tfidfPostsRDD = tfidfRDD.filter(lambda x: x[2] == 'Post').cache()
    tfidfPostsBroadcast = sc.broadcast(tfidfPostsRDD.map(lambda x: (x[0], x[1])).collectAsMap())
    corpusPostsNormsRDD = tfidfPostsRDD.map(lambda x: (x[0], norm(x[1]))).cache()
    corpusPostsNormsBroadcast = sc.broadcast(corpusPostsNormsRDD.collectAsMap())

    # Only the subcategory Naive Bayes model is used; the category-level
    # Naive Bayes and Decision Tree alternatives were disabled in the original.
    classifier = Classifier(sc, 'NaiveBayes')
    modelNaiveBayesSubcategory = classifier.getModel('/dados/models/naivebayes/subcategory_new')

    postsSpaceVectorRDD = classifier.createVectSpacePost(tfidfPostsRDD, tokens)
    # List of (predicted label, [post ids that received that label]).
    predictions = (postsSpaceVectorRDD
                   .map(lambda p: (modelNaiveBayesSubcategory.predict(p[1]), p[0]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())

    for predictedLabel, postIds in predictions:

        # The label indexes a (category, subcategory) pair; only the category
        # is used to select candidate products.
        category_to_use = categoryAndSubcategory[int(predictedLabel)][0]

        tfidfProductsCategoryRDD = tfidfRDD.filter(lambda x: x[2] == category_to_use).cache()
        tfidfProductsCategoryBroadcast = sc.broadcast(tfidfProductsCategoryRDD.map(lambda x: (x[0], x[1])).collectAsMap())

        # Inverted (token, record id) pairs; joining them on the token yields
        # every (product id, post id) pair that shares at least one token.
        corpusInvPairsProductsRDD = tfidfProductsCategoryRDD.flatMap(lambda r: ([(x, r[0]) for x in r[1]])).cache()
        corpusInvPairsPostsRDD = (tfidfPostsRDD
                                  .flatMap(lambda r: ([(x, r[0]) for x in r[1]]))
                                  .filter(lambda x: x[1] in postIds)
                                  .cache())
        commonTokens = (corpusInvPairsProductsRDD.join(corpusInvPairsPostsRDD)
                                                 .map(lambda x: (x[1], x[0]))
                                                 .groupByKey()
                                                 .cache())

        corpusProductsNormsRDD = tfidfProductsCategoryRDD.map(lambda x: (x[0], norm(x[1]))).cache()
        corpusProductsNormsBroadcast = sc.broadcast(corpusProductsNormsRDD.collectAsMap())

        similaritiesRDD = (commonTokens
                           .map(lambda x: cosineSimilarity(x, tfidfProductsCategoryBroadcast.value, tfidfPostsBroadcast.value, corpusProductsNormsBroadcast.value, corpusPostsNormsBroadcast.value))
                           .cache())

        # Presumably cosineSimilarity returns ((product id, post id), score);
        # the records are re-keyed by post id and low scores are dropped.
        # NOTE(review): `threshold` is not defined in this function and is
        # expected to come from module scope - confirm it exists.
        suggestions = (similaritiesRDD
                       .map(lambda x: (x[0][1], (x[0][0], x[1])))
                       .filter(lambda x: x[1][1] > threshold)
                       .groupByKey()
                       .mapValues(list)
                       .join(postsRDD)
                       .join(postsRDD.map(lambda x: (x[0], x[3])))
                       .collect())

        if suggestions:
            insertSuggestions(suggestions, iduser, productRDD)

    elap = timer() - start
    print('it tooks %d seconds' % elap)
import mlflow
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from classes.Vectorizer import Vectorizer
from classes.Classifier import Classifier
from classes.Resample import Resample
from classes.ModelBuilder import ModelBuilder
from classes.Constans import *

# In[ ]:
# Instantiate the project helper classes imported above.
classifier = Classifier()
vectorizer = Vectorizer()
resample = Resample()
builder = ModelBuilder()
# Starts out empty; nothing in this part of the script fills it.
classifier_list = []

# ### Get Info from CSV

# In[ ]:
# Training data: pipe-separated CSV, rows shuffled after loading.
df_train = shuffle(pd.read_csv('data/train_preprocessed.csv', sep='|'))
# Test data: only the 'id' and 'Pregunta' columns are read, rows shuffled too.
df_test = shuffle(
    pd.read_csv('data/test_santander.csv', usecols=['id', 'Pregunta']))
# Print how many training rows carry each value of 'Intencion_cat_label'.
print(df_train['Intencion_cat_label'].value_counts())
# Resample the training set: at least one class has only a single sample, and
# a stratified split needs at least two samples per class.
# NOTE(review): the meaning of the arguments 5 and 100 is defined by
# Resample.apply_resample, which is not visible here - confirm.
df_train = resample.apply_resample(df_train, 'Pregunta', 5, 100)
コード例 #4
0
# In[ ]:
import sys
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from classes.Vectorizer import Vectorizer
from classes.Classifier import Classifier
from classes.Resample import Resample
from classes.ModelBuilder import ModelBuilder
from classes.Constans import *

# In[ ]:
# Instantiate the project helper classes imported above; build_model below
# reads `builder` as a module-level name.
classifier = Classifier()
vectorizer = Vectorizer()
resample = Resample()
builder = ModelBuilder()

#--------------------------------------------------#
###                  CLASSIFIERS                 ###
#--------------------------------------------------#
# The classifiers to work with come from the project Classifier helper.
classifier_list = classifier.get_classifier_list()


def build_model(X, y, model, df_test):
    model_name = model.__class__.__name__
    X_train, X_test, y_train, y_test = builder.get_train_test_split(X, y)
    RESAMPLE_FILE = 'data/apply_resample_after_{}.png'.format(model_name)
    if APPLY_RESAMPLE == True: