def crossvalidation(self, nbSplits=5):
    '''Cross-validate this classifier on self.dataset.

    Splits the dataset into nbSplits equally weighted random parts; for
    each part, trains on the remaining parts and measures precision on it.

    Returns (precision min, precision max, mean of precisions).
    '''
    precMin = 1.0
    precMax = 0.0
    listPrec = []
    aSplit = [1 for _ in range(0, nbSplits)]
    rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
    for cpt in range(0, nbSplits):
        # cpt + 1 so the log reads 1/5 .. 5/5 instead of 0/5 .. 4/5
        MessageManager.debugMessage(
            "DataClassifier : start new cross-validation iteration %d/%d"
            % (cpt + 1, nbSplits))
        trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
        print('trainset size : %d' % trainSet.count())
        print('testset size : %d' % testSet.count())
        # A fresh classifier is needed because self.predict cannot be
        # used inside a Spark map closure (self is not serializable here).
        model = DataClassifier(None, self.classifier)
        model.train(trainSet)
        evaluatorRdd = testSet.map(lambda p: (p.label, model.predict(p.features)))
        print('evaluatorRdd size : %d' % evaluatorRdd.count())
        # tuple-parameter lambdas are Python 2 only; index the pair instead
        nbOK = evaluatorRdd.filter(lambda pair: pair[0] == pair[1]).count()
        nbTOT = testSet.count()
        # guard against an empty test split (ZeroDivisionError otherwise)
        prec = nbOK / float(nbTOT) if nbTOT else 0.0
        if prec < precMin:
            precMin = prec
        if prec > precMax:
            precMax = prec
        listPrec.append(prec)
        print('iteration precision : %f' % prec)
    return precMin, precMax, mean(listPrec)
def doIt(self):
    '''Scan the Reuters corpus RDD and return an RDD of matching news.'''
    # local alias so the Spark closure captures the list, not self
    looking = self.lookingList
    result = self.filenameRdd \
        .flatMap(lambda line: _fct(looking, line)) \
        .filter(lambda item: item != [])
    MessageManager.debugMessage(
        "ReutersNewsSourceHDFS : stop reading Reuters corpus")
    return result
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Build an RDD of News for `symbole` whose date falls within
    [startDate, endDate] and whose headline or body contains at least one
    of `keywords` (case-insensitive substring match).
    '''
    upperKeywords = [x.upper() for x in keywords]
    MessageManager.debugMessage(
        "ReutersNewsSourceHDFS : start reading Reuters corpus")

    def hasAnyofTheresKeywords(keywords, text):
        # True if any keyword appears as a substring of text
        return any(word in text for word in keywords)

    def fct(line):
        # Parse one CSV line "date,headline,body,..." and return a News,
        # or None when the line does not match / is a header or empty line.
        try:
            lines = line.split(',')
            date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
            if startDate <= date <= endDate:
                head = lines[1]
                msg = ''.join(lines[2:])
                if (hasAnyofTheresKeywords(upperKeywords, head.upper())
                        or hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                    return News(pubDate=date, symbole=symbole,
                                publication=head + msg, pubSource="Reuters")
        # narrowed from bare `except:` — only parse/field errors are
        # expected here (explicative line or empty)
        except (ValueError, IndexError):
            pass
        return None

    newsRdd = self.filenameRdd.map(fct).filter(lambda x: x is not None)
    MessageManager.debugMessage(
        "ReutersNewsSourceHDFS : stop reading Reuters corpus")
    return newsRdd
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Build an RDD of News for `symbole` whose date falls within
    [startDate, endDate] and whose headline or body contains at least one
    of `keywords` (case-insensitive substring match).
    '''
    upperKeywords = [x.upper() for x in keywords]
    MessageManager.debugMessage("ReutersNewsSourceHDFS : start reading Reuters corpus")

    def hasAnyofTheresKeywords(keywords, text):
        # True if any keyword appears as a substring of text
        return any(word in text for word in keywords)

    def fct(line):
        # Parse one CSV line "date,headline,body,..." and return a News,
        # or None when the line does not match / is a header or empty line.
        try:
            lines = line.split(',')
            date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
            if startDate <= date <= endDate:
                head = lines[1]
                msg = ''.join(lines[2:])
                if (hasAnyofTheresKeywords(upperKeywords, head.upper())
                        or hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                    return News(pubDate=date, symbole=symbole,
                                publication=head + msg, pubSource="Reuters")
        # narrowed from bare `except:` — only parse/field errors are
        # expected here (explicative line or empty)
        except (ValueError, IndexError):
            pass
        return None

    newsRdd = self.filenameRdd.map(fct).filter(lambda x: x is not None)
    MessageManager.debugMessage("ReutersNewsSourceHDFS : stop reading Reuters corpus")
    return newsRdd
def requestForMarkets(self, symbole):
    '''Fetch daily market history for `symbole` as CSV from self.url.

    Queries from 2000-01-01 up to today and returns the response body as
    a list of CSV lines.
    '''
    query = dict(
        q=symbole,
        startdate='2000-01-01',
        enddate=time.strftime('%Y-%m-%d'),
        num=30,
        output='csv',
    )
    response = requests.get(self.url, params=query)
    MessageManager.debugMessage("GoogleFinanceMarketSource : request")
    return response.text.encode('utf-8').split('\n')
def addMarketStatusToNews(self, news):
    '''Attach to each News the market data published within 7 days after
    its publication date, keeping at most the 3 earliest entries.

    Market rows are CSV lines "date,open,high,low,close,volume" cached
    per symbol in self.symboles.
    '''
    for new in news:
        self.addIfNotExist(new.symbole)
        enddate = new.pubDate + datetime.timedelta(days=7)
        isHeader = True
        new.marketStatus = []
        for line in self.symboles[new.symbole]:
            if isHeader:
                # first CSV line is the column header — skip it
                isHeader = False
                continue
            try:
                date_m, open_m, high_m, low_m, close_m, volume_m = line.split(',')
                date_m = datetime.datetime.strptime(date_m, "%d-%b-%y")
                if new.pubDate <= date_m <= enddate:
                    MessageManager.debugMessage(
                        "GoogleFinanceMarketSource : add marketStatus")
                    for value in (date_m, open_m, high_m, low_m, close_m, volume_m):
                        MessageManager.debugMessage(str(value))
                    new.marketStatus.append(
                        MarketStatus(date_m, open_m, high_m, low_m,
                                     close_m, volume_m))
                    MessageManager.debugMessage(
                        "GoogleFinanceMarketSource : marketStatus added")
            # narrowed from bare `except:` — split/strptime raise
            # ValueError on empty or malformed lines
            except ValueError:
                MessageManager.debugMessage(
                    "GoogleFinanceMarketSource : exception")
        # keep the 3 market days closest after publication
        new.marketStatus = sorted(new.marketStatus,
                                  key=lambda x: x.market_date)[:3]
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Scan the local Reuters CSV corpus (self.filename) and append to
    self.news every News for `symbole` dated within [startDate, endDate]
    whose headline or body contains one of `keywords` (case-insensitive).
    '''
    upperKeywords = [x.upper() for x in keywords]
    MessageManager.debugMessage(
        "ReutersNewsSource : start reading Reuters corpus")
    # context manager guarantees the file is closed even if a line raises
    with open(self.filename, 'r') as f:
        for line in f:
            try:
                lines = line.split(',')
                date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
                if startDate <= date <= endDate:
                    head = lines[1]
                    msg = ''.join(lines[2:])
                    if (self.hasAnyofTheresKeywords(upperKeywords, head.upper())
                            or self.hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                        MessageManager.debugMessage(
                            "ReutersNewsSource : head or msg has keywords")
                        self.news.append(
                            News(pubDate=date, symbole=symbole,
                                 publication=head, pubSource="Reuters"))
            # narrowed from bare `except:` — explicative line or empty
            except (ValueError, IndexError):
                pass
    MessageManager.debugMessage(
        "ReutersNewsSource : stop reading Reuters corpus")
    MessageManager.debugMessage("ReutersNewsSource : %d news found"
                                % len(self.news))
def classification(filepath='/media/droz/KIKOOLOL HDD/Corpus/dataset/dataset.txt', sc=None):
    '''Train a classifier on 80% of the dataset file and print the
    precision measured on the remaining 20%.

    Each input line is a Python literal "(data, label)"; the label is
    mapped to 1 (True) or 0 (False).
    '''
    MessageManager.debugMessage("classification : start open file %s" % filepath)
    lines = sc.textFile(filepath)
    # tuple-parameter lambdas are Python 2 only; index the pair instead
    fullDataSet = lines.map(lambda line: literal_eval(line)) \
                       .map(lambda t: LabeledPoint((1 if t[1] else 0), t[0])) \
                       .cache()
    MessageManager.debugMessage("classification : start split dataset")
    # fixed seed 17 keeps the split reproducible between runs
    trainRdd, testRdd = fullDataSet.randomSplit([80, 20], 17)
    dc = DataClassifier()
    MessageManager.debugMessage("classification : start training")
    dc.train(trainRdd)
    MessageManager.debugMessage("classification : stop training")
    MessageManager.debugMessage("classification : start prediction")
    evaluatorRdd = testRdd.map(lambda p: (p.label, dc.predict(p.features)))
    nbOK = evaluatorRdd.filter(lambda pair: pair[0] == pair[1]).count()
    nbTOT = testRdd.count()
    precision = nbOK / float(nbTOT)
    print('precision : %f' % precision)
def crossvalidation(self, nbSplits=5):
    '''Cross-validate every registered algorithm to compare them.

    Runs nbSplits train/test rounds; per algorithm, tracks min/max/mean
    precision and accumulates confusion matrices (their mean is stored in
    self.matrixConfusion). Returns showResultConsole's result (best
    algorithm name and mean precision).
    '''
    dicoPrec = {}
    for (classifier, name) in self.classifier:
        dicoPrec[name] = {'min': 1.0, 'max': 0.0, 'mean': []}
    aSplit = [1 for _ in range(0, nbSplits)]
    rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
    matrixList = []
    for cpt in range(0, nbSplits):
        MessageManager.debugMessage(
            "DataClassifierEvaluator : start new cross-validation iteration %d/%d"
            % (cpt + 1, nbSplits))
        trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
        print('trainset size : %d' % trainSet.count())
        print('testset size : %d' % testSet.count())
        for (classifier, name) in self.classifier:
            currentModel = DataClassifier(None, classifier)
            currentModel.train(trainSet)

            # Bind the model as a default argument to dodge the
            # late-binding-closure pitfall, and return a flat iterator
            # directly (the original yielded a generator-of-generator and
            # needed an extra flatMap to unwrap it).
            def evaluate(iterator, model=currentModel):
                return ((p.label, model.predict(p.features)) for p in iterator)

            evaluatorRdd = testSet.mapPartitions(evaluate)
            matrix = self._createConfusionMatrix(evaluatorRdd)
            matrixList.append(matrix)
            self._showMatrix(matrix)
            prec = self._results(evaluatorRdd)
            if prec < dicoPrec[name]['min']:
                dicoPrec[name]['min'] = prec
            if prec > dicoPrec[name]['max']:
                dicoPrec[name]['max'] = prec
            dicoPrec[name]['mean'].append(prec)
        print('=== Result of iteration ===')
        self.showResultConsole(dicoPrec)
    print('+++=== mean confusion matrix ===+++')
    self.matrixConfusion = self._meanMatrix(matrixList)
    self._showMatrix(self.matrixConfusion)
    print('+++=== Final Result ===+++')
    return self.showResultConsole(dicoPrec)
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Search news for `symbole` published between startDate and endDate,
    paging through Google Finance results self.num at a time, and append
    each hit to self.news.

    (`keywords` is accepted for interface parity but not used by this
    source — the query itself filters by symbol.)
    '''
    hasMoreQuote = True
    params = {
        'q': symbole,
        'startdate': str(startDate.strftime('%Y-%m-%d')),
        'enddate': str(endDate.strftime('%Y-%m-%d')),
        'start': 0,
        'num': self.num
    }
    while hasMoreQuote:
        r = requests.get(self.url, params=params)
        MessageManager.debugMessage("GoogleFinanceNewsSource : request")
        text = self.h.unescape(r.text).encode('utf-8')
        quotes = re.findall(self.expNews, text)
        dates = re.findall(self.expDate, text)
        sources = re.findall(self.expPubSource, text)
        # a short page means this was the last one
        if len(quotes) < self.num:
            hasMoreQuote = False
        for cpt in xrange(len(quotes)):
            try:
                # absolute dates look like "Feb 26, 2015"
                date = datetime.datetime.strptime(dates[cpt], "%b %d, %Y")
                self.news.append(
                    News(pubDate=date, symbole=symbole,
                         publication=quotes[cpt], pubSource=sources[cpt]))
            # narrowed from bare `except:` — relative dates ("2 hours
            # ago") fail to parse, or the dates list is shorter than
            # quotes; stamp those with the current time
            except (ValueError, IndexError):
                MessageManager.debugMessage("new recent, ... set for today")
                self.news.append(
                    News(pubDate=datetime.datetime.now(), symbole=symbole,
                         publication=quotes[cpt], pubSource=sources[cpt]))
        params['start'] += self.num
def crossvalidation(self, nbSplits=5):
    '''Cross-validate every registered algorithm to compare them.

    Runs nbSplits train/test rounds; per algorithm, tracks min/max/mean
    precision and accumulates confusion matrices (their mean is stored in
    self.matrixConfusion). Returns showResultConsole's result (best
    algorithm name and mean precision).
    '''
    dicoPrec = {}
    for (classifier, name) in self.classifier:
        dicoPrec[name] = {'min': 1.0, 'max': 0.0, 'mean': []}
    aSplit = [1 for _ in range(0, nbSplits)]
    rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
    matrixList = []
    for cpt in range(0, nbSplits):
        MessageManager.debugMessage(
            "DataClassifierEvaluator : start new cross-validation iteration %d/%d"
            % (cpt + 1, nbSplits))
        trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
        print('trainset size : %d' % trainSet.count())
        print('testset size : %d' % testSet.count())
        for (classifier, name) in self.classifier:
            currentModel = DataClassifier(None, classifier)
            currentModel.train(trainSet)

            # Bind the model as a default argument to dodge the
            # late-binding-closure pitfall, and return a flat iterator
            # directly (the original yielded a generator-of-generator and
            # needed an extra flatMap to unwrap it).
            def evaluate(iterator, model=currentModel):
                return ((p.label, model.predict(p.features)) for p in iterator)

            evaluatorRdd = testSet.mapPartitions(evaluate)
            matrix = self._createConfusionMatrix(evaluatorRdd)
            matrixList.append(matrix)
            self._showMatrix(matrix)
            prec = self._results(evaluatorRdd)
            if prec < dicoPrec[name]['min']:
                dicoPrec[name]['min'] = prec
            if prec > dicoPrec[name]['max']:
                dicoPrec[name]['max'] = prec
            dicoPrec[name]['mean'].append(prec)
        print('=== Result of iteration ===')
        self.showResultConsole(dicoPrec)
    print('+++=== mean confusion matrix ===+++')
    self.matrixConfusion = self._meanMatrix(matrixList)
    self._showMatrix(self.matrixConfusion)
    print('+++=== Final Result ===+++')
    return self.showResultConsole(dicoPrec)
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Search news for `symbole` published between startDate and endDate,
    paging through Google Finance results self.num at a time, and append
    each hit to self.news.

    (`keywords` is accepted for interface parity but not used by this
    source — the query itself filters by symbol.)
    '''
    hasMoreQuote = True
    params = {'q': symbole,
              'startdate': str(startDate.strftime('%Y-%m-%d')),
              'enddate': str(endDate.strftime('%Y-%m-%d')),
              'start': 0,
              'num': self.num}
    while hasMoreQuote:
        r = requests.get(self.url, params=params)
        MessageManager.debugMessage("GoogleFinanceNewsSource : request")
        text = self.h.unescape(r.text).encode('utf-8')
        quotes = re.findall(self.expNews, text)
        dates = re.findall(self.expDate, text)
        sources = re.findall(self.expPubSource, text)
        # a short page means this was the last one
        if len(quotes) < self.num:
            hasMoreQuote = False
        for cpt in xrange(len(quotes)):
            try:
                # absolute dates look like "Feb 26, 2015"
                date = datetime.datetime.strptime(dates[cpt], "%b %d, %Y")
                self.news.append(News(pubDate=date, symbole=symbole,
                                      publication=quotes[cpt],
                                      pubSource=sources[cpt]))
            # narrowed from bare `except:` — relative dates ("2 hours
            # ago") fail to parse, or the dates list is shorter than
            # quotes; stamp those with the current time
            except (ValueError, IndexError):
                MessageManager.debugMessage("new recent, ... set for today")
                self.news.append(News(pubDate=datetime.datetime.now(),
                                      symbole=symbole,
                                      publication=quotes[cpt],
                                      pubSource=sources[cpt]))
        params['start'] += self.num
def crossvalidation(self, nbSplits=5):
    '''Cross-validate this classifier on self.dataset.

    Splits the dataset into nbSplits equally weighted random parts; for
    each part, trains on the remaining parts and measures precision on it.

    Returns (precision min, precision max, mean of precisions).
    '''
    precMin = 1.0
    precMax = 0.0
    listPrec = []
    aSplit = [1 for _ in range(0, nbSplits)]
    rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
    for cpt in range(0, nbSplits):
        # cpt + 1 so the log reads 1/5 .. 5/5 instead of 0/5 .. 4/5
        MessageManager.debugMessage(
            "DataClassifier : start new cross-validation iteration %d/%d"
            % (cpt + 1, nbSplits))
        trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
        print('trainset size : %d' % trainSet.count())
        print('testset size : %d' % testSet.count())
        # A fresh classifier is needed because self.predict cannot be
        # used inside a Spark map closure (self is not serializable here).
        model = DataClassifier(None, self.classifier)
        model.train(trainSet)
        evaluatorRdd = testSet.map(lambda p: (p.label, model.predict(p.features)))
        print('evaluatorRdd size : %d' % evaluatorRdd.count())
        # tuple-parameter lambdas are Python 2 only; index the pair instead
        nbOK = evaluatorRdd.filter(lambda pair: pair[0] == pair[1]).count()
        nbTOT = testSet.count()
        # guard against an empty test split (ZeroDivisionError otherwise)
        prec = nbOK / float(nbTOT) if nbTOT else 0.0
        if prec < precMin:
            precMin = prec
        if prec > precMax:
            precMax = prec
        listPrec.append(prec)
        print('iteration precision : %f' % prec)
    return precMin, precMax, mean(listPrec)
def lookingAt(self, symbole, startDate, endDate, keywords):
    '''Scan the local Reuters CSV corpus (self.filename) and append to
    self.news every News for `symbole` dated within [startDate, endDate]
    whose headline or body contains one of `keywords` (case-insensitive).
    '''
    upperKeywords = [x.upper() for x in keywords]
    MessageManager.debugMessage("ReutersNewsSource : start reading Reuters corpus")
    # context manager guarantees the file is closed even if a line raises
    with open(self.filename, 'r') as f:
        for line in f:
            try:
                lines = line.split(',')
                date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
                if startDate <= date <= endDate:
                    head = lines[1]
                    msg = ''.join(lines[2:])
                    if (self.hasAnyofTheresKeywords(upperKeywords, head.upper())
                            or self.hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                        MessageManager.debugMessage("ReutersNewsSource : head or msg has keywords")
                        self.news.append(News(pubDate=date, symbole=symbole,
                                              publication=head, pubSource="Reuters"))
            # narrowed from bare `except:` — explicative line or empty
            except (ValueError, IndexError):
                pass
    MessageManager.debugMessage("ReutersNewsSource : stop reading Reuters corpus")
    MessageManager.debugMessage("ReutersNewsSource : %d news found" % len(self.news))
def useDataClassifier(filepath='/media/droz/KIKOOLOL HDD/Corpus/dataset/dataset.txt', sc=None):
    '''Cross-validate an SVM classifier on the dataset file, then train
    it on the full dataset, persist the model, and print the precision
    summary (min/max/mean).
    '''
    MessageManager.debugMessage("useDataClassifier : start open file %s" % filepath)
    lines = sc.textFile(filepath)
    # each line is a literal "(data, label)" pair; tuple-parameter
    # lambdas are Python 2 only, so index the pair instead
    fullDataSet = lines.map(lambda line: literal_eval(line)) \
                       .map(lambda t: LabeledPoint((1 if t[1] else 0), t[0]))
    fullDataSet.cache()
    dc = DataClassifier(fullDataSet, SVMWithSGD)
    MessageManager.debugMessage("useDataClassifier : start crossvalidation")
    precMin, precMax, prec = dc.crossvalidation(5)
    MessageManager.debugMessage("useDataClassifier : train full dataset")
    dc.train(fullDataSet)
    dc.saveModel()
    print('min : %f, max : %f, mean : %f' % (precMin, precMax, prec))
def selectBestModel(self):
    '''Cross-validate every registered classifier and return the one
    with the highest mean precision, as (classifier, name).'''
    bestName = ''
    bestPrec = 0.0
    bestClassifier = None
    for classifier, name in self.classifier:
        MessageManager.debugMessage('DataClassifierEvaluator : Start evaluation of %s' % name)
        candidate = DataClassifier(self.dataset, classifier)
        precMin, precMax, precMean = candidate.crossvalidation()
        MessageManager.debugMessage('DataClassifierEvaluator : Results for %s : \n\tPrecMin : %f\n\tPrecMax : %f\n\tPrecMean : %f' % (name, precMin, precMax, precMean))
        # keep the candidate only if it strictly beats the current best
        if precMean > bestPrec:
            bestPrec, bestName, bestClassifier = precMean, name, classifier
    MessageManager.debugMessage('DataClassifierEvaluator : best classifier is %s with precision of %f' % (bestName, bestPrec))
    return bestClassifier, bestName
def addMarketStatusToNews(self, news):
    '''Attach to each News the market data published within 7 days after
    its publication date, keeping at most the 3 earliest entries.

    Market rows are CSV lines "date,open,high,low,close,volume" cached
    per symbol in self.symboles.
    '''
    for new in news:
        self.addIfNotExist(new.symbole)
        enddate = new.pubDate + datetime.timedelta(days=7)
        isHeader = True
        new.marketStatus = []
        for line in self.symboles[new.symbole]:
            if isHeader:
                # first CSV line is the column header — skip it
                isHeader = False
                continue
            try:
                date_m, open_m, high_m, low_m, close_m, volume_m = line.split(',')
                date_m = datetime.datetime.strptime(date_m, "%d-%b-%y")
                if new.pubDate <= date_m <= enddate:
                    MessageManager.debugMessage("GoogleFinanceMarketSource : add marketStatus")
                    for value in (date_m, open_m, high_m, low_m, close_m, volume_m):
                        MessageManager.debugMessage(str(value))
                    new.marketStatus.append(MarketStatus(date_m, open_m, high_m, low_m, close_m, volume_m))
                    MessageManager.debugMessage("GoogleFinanceMarketSource : marketStatus added")
            # narrowed from bare `except:` — split/strptime raise
            # ValueError on empty or malformed lines
            except ValueError:
                MessageManager.debugMessage("GoogleFinanceMarketSource : exception")
        # keep the 3 market days closest after publication
        new.marketStatus = sorted(new.marketStatus, key=lambda x: x.market_date)[:3]
def selectBestModel(self):
    '''Cross-validate every registered classifier and return the one
    with the highest mean precision, as (classifier, name).'''
    bestName = ''
    bestPrec = 0.0
    bestClassifier = None
    for classifier, name in self.classifier:
        MessageManager.debugMessage(
            'DataClassifierEvaluator : Start evaluation of %s' % name)
        candidate = DataClassifier(self.dataset, classifier)
        precMin, precMax, precMean = candidate.crossvalidation()
        MessageManager.debugMessage(
            'DataClassifierEvaluator : Results for %s : \n\tPrecMin : %f\n\tPrecMax : %f\n\tPrecMean : %f'
            % (name, precMin, precMax, precMean))
        # keep the candidate only if it strictly beats the current best
        if precMean > bestPrec:
            bestPrec, bestName, bestClassifier = precMean, name, classifier
    MessageManager.debugMessage(
        'DataClassifierEvaluator : best classifier is %s with precision of %f'
        % (bestName, bestPrec))
    return bestClassifier, bestName
def loadModel(self):
    '''Load a previously saved model from self.modelpath into self.model.

    NOTE(review): pickle.load executes arbitrary code from the file —
    only load model files this application wrote itself.
    '''
    MessageManager.debugMessage("DataClassifier : Load Model")
    # `with` closes the file handle (the original leaked it)
    with open(self.modelpath, 'rb') as f:
        self.model = pickle.load(f)
    # Tail of createRandomNews() — its `def` is outside this chunk.
    # NOTE(review): 'Reuteurs' looks like a typo for 'Reuters' — confirm
    # against the rest of the project before changing the literal.
    return News(pubDate=date, symbole='NASDAQ:GOOGL', publication=txt, pubSource='Reuteurs', marketStatus=[m1,m2,m3])

if __name__ == "__main__":
    # Demo/benchmark entry point: build 30 random news, featurize them
    # into a cached dataset, and cross-validate a weighted ensemble of
    # three Spark MLlib classifiers.
    allNews = [createRandomNews() for x in range(30)]
    sc = SparkContext()
    newsRDD = sc.parallelize(allNews).distinct()
    dataSetMaker = DataSetMakerV2()
    fullDataSet = dataSetMaker.process(newsRDD)
    fullDataSet.cache()
    myClassifier = ClassifiersWrapper()
    myClassifier.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.3)
    dc = DataClassifier(fullDataSet, myClassifier)
    MessageManager.debugMessage("main : start crossvalidation")
    precMin, precMax, prec = dc.crossvalidation(5)
    print('min : %f, max : %f, mean : %f' % (precMin, precMax, prec))
    # The string literal below is dead exploratory code kept as-is.
    '''
    featuresRDD = newsRDD.map(lambda x: FeaturesV2(x))
    allBg2 = featuresRDD.map(lambda x: list(x.bg2)).reduce(lambda a,b : a+b)
    allBg3 = featuresRDD.map(lambda x: list(x.bg3)).reduce(lambda a,b : a+b)
    setAllBg2 = set(allBg2)
    setAllBg3 = set(allBg3)
    print('size of setAllBg2 : %d' % len(setAllBg2))
    print('size of setAllBg3 : %d' % len(setAllBg3))
    allBg2Flat = featuresRDD.flatMap(lambda x: list(x.bg2))
    allBg2FlatUnique = allBg2Flat.intersection(allBg2Flat).collect()
    print('size of allBg2FlatUnique %d' % len(allBg2FlatUnique))
    '''
def doIt(self):
    '''Scan the Reuters corpus RDD and return an RDD of matching news.'''
    # local alias so the Spark closure captures the list, not self
    looking = self.lookingList
    result = self.filenameRdd.flatMap(lambda line: _fct(looking, line)).filter(lambda item: item != [])
    MessageManager.debugMessage("ReutersNewsSourceHDFS : stop reading Reuters corpus")
    return result
def saveModel(self):
    '''Persist self.model to self.modelpath with pickle.'''
    MessageManager.debugMessage("DataClassifier : Save Model")
    # `with` guarantees flush + close (the original leaked the handle)
    with open(self.modelpath, 'wb') as f:
        pickle.dump(self.model, f)
def requestForMarkets(self, symbole):
    '''Fetch daily market history for `symbole` as CSV from self.url.

    Queries from 2000-01-01 up to today and returns the response body as
    a list of CSV lines.
    '''
    query = dict(q=symbole,
                 startdate='2000-01-01',
                 enddate=time.strftime('%Y-%m-%d'),
                 num=30,
                 output='csv')
    response = requests.get(self.url, params=query)
    MessageManager.debugMessage("GoogleFinanceMarketSource : request")
    return response.text.encode('utf-8').split('\n')