def load_best_classifier_conf():
    """Build the best-performing ensemble classifier configuration.

    Combines four Spark MLlib binary classifiers in a ClassifiersWrapper
    with fixed voting weights (presumably tuned empirically).

    Returns:
        ClassifiersWrapper: the configured, untrained ensemble.
    """
    # Imports are kept function-local so importing this module does not
    # require Spark or the project classifier package to be available.
    from DataClassifierV2 import ClassifiersWrapper
    from pyspark.mllib.classification import (
        SVMWithSGD,
        LogisticRegressionWithSGD,
        LogisticRegressionWithLBFGS,
        NaiveBayes,
    )

    # (classifier, voting weight) pairs.
    # NOTE(review): weights sum to 1.6, not 1.0 — presumably
    # ClassifiersWrapper normalizes or uses raw weighted votes; confirm
    # before retuning these values.
    ensemble = [
        (SVMWithSGD, 0.3),
        (LogisticRegressionWithSGD, 0.3),
        (NaiveBayes, 0.3),
        (LogisticRegressionWithLBFGS, 0.7),
    ]

    myClassifier = ClassifiersWrapper()
    for classifier, weight in ensemble:
        myClassifier.addClassifier(classifier=classifier,
                                   trainParameters={},
                                   weight=weight)
    return myClassifier
# --- News collection ---------------------------------------------------
# Register ticker -> search-keyword mappings on the news source.
# (newSource is created earlier in the file, outside this chunk.)
newSource.lookingAll('NASDAQ:GOOGL', ['GOOG', 'GOOGL', 'GOOGLE'])
newSource.lookingAll('NASDAQ:NVDA', ['NVIDIA'])
newSource.lookingAll('VTX:NESN', ['NESTLE'])
newSource.lookingAll('VTX:SCMN', ['SWISSCOM'])
newSource.lookingAll('VTX:NOVN', ['NOVARTIS'])
# Fetch the collected news as an RDD.
newsRDD = newSource.doIt()

# --- Labelling ---------------------------------------------------------
# Market-data source for the same tickers; each news item is tagged with
# the corresponding market status.
marketSource = GoogleFinanceMarketSourceSpark(['NASDAQ:GOOGL', 'NASDAQ:NVDA', 'VTX:NESN', 'VTX:SCMN', 'VTX:NOVN'])
newsRDD = newsRDD.map(lambda x: marketSource.addMarketStatusToNews(x))
#newsRDD = newsRDD.randomSplit([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])[0]
# Cache before count() so later actions do not recompute the pipeline.
newsRDD.cache()
print('nb news : %d' % newsRDD.count())

# --- Feature extraction ------------------------------------------------
# Build the binary-labelled data set; n = 200000 is the feature-vector
# size (hard-coded here; the other variant of this script reads it from
# config.FEATURES_CONF).
dataSetMaker = DataSetMakerV2(n=200000)
fullDataSet = dataSetMaker.processBinary(newsRDD)
fullDataSet.cache()

# --- Ensemble classifiers ----------------------------------------------
# NOTE(review): the weights below sum to 1.6, not 1.0 — presumably
# ClassifiersWrapper normalizes or uses raw weighted votes; confirm.
# First ensemble, used for the one-vs-one multi-class scheme.
myClassifier = ClassifiersWrapper()
myClassifier.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)
myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
myClassifier.addClassifier(classifier=NaiveBayes, trainParameters={}, weight=0.3)
myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.7)
# Second, identically configured ensemble for the one-vs-many scheme
# (a separate instance so the two schemes train independent models).
myClassifier2 = ClassifiersWrapper()
myClassifier2.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)
myClassifier2.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
myClassifier2.addClassifier(classifier=NaiveBayes, trainParameters={}, weight=0.3)
myClassifier2.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.7)

# --- Evaluation --------------------------------------------------------
dataClassifierEvaluator = DataClassifierEvaluator(fullDataSet)
#dataClassifierEvaluator.addModel(myClassifier, 'My Classifier')
# Wrap each ensemble in a 4-class multi-class strategy.
myClassifierOnevsOne = DataClassifierMultiClassesOneVsOne(myClassifier, 4)
myClassifierOnevsMany = DataClassifierMultiClassesOneVsMany(myClassifier2, 4)
#newSource.lookingAll('NASDAQ:NVDA', ['NVIDIA']) #newSource.lookingAll('VTX:NESN', ['NESTLE']) #newSource.lookingAll('VTX:SCMN', ['SWISSCOM']) #newSource.lookingAll('VTX:NOVN', ['NOVARTIS']) newsRDD = newSource.doIt() marketSource = GoogleFinanceMarketSourceSpark( ['NASDAQ:GOOGL', 'NASDAQ:NVDA', 'VTX:NESN', 'VTX:SCMN', 'VTX:NOVN']) newsRDD = newsRDD.map(lambda x: marketSource.addMarketStatusToNews(x)) #newsRDD = newsRDD.randomSplit([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])[0] newsRDD.cache() print('nb news : %d' % newsRDD.count()) dataSetMaker = DataSetMakerV2(n=config.FEATURES_CONF['vecteur_size']) fullDataSet = dataSetMaker.process(newsRDD) # TODO change fullDataSet.cache() myClassifier = ClassifiersWrapper() myClassifier.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.4) #myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3) myClassifier.addClassifier(classifier=NaiveBayes, trainParameters={}, weight=0.4) myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.4) myClassifier2 = ClassifiersWrapper() myClassifier2.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)