예제 #1
0
 def crossvalidation(self, nbSplits=5):
     '''
     Run nbSplits-fold cross-validation of this classifier on
     ``self.dataset``: split into equal random folds, train on all but
     one fold and measure precision on the held-out fold.

     return precision min, precision max, moyenne des precisions
     '''
     precMin = 1.0
     precMax = 0.0
     listPrec = []
     # equal weights -> folds of (roughly) equal size
     aSplit = [1] * nbSplits
     rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
     for cpt in range(0, nbSplits):
         # cpt + 1 so the log reads 1/5 .. 5/5 instead of 0/5 .. 4/5
         MessageManager.debugMessage("DataClassifier : start new cross-validation iteration %d/%d" % (cpt + 1, nbSplits))
         trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
         print('trainset size : %d' % trainSet.count())
         print('testset size : %d' % testSet.count())
         # cheat because we can't use self.predict in a map here...
         toto = DataClassifier(None, self.classifier)
         toto.train(trainSet)
         evaluatorRdd = testSet.map(lambda p: (p.label, toto.predict(p.features)))
         print('evaluatorRdd size : %d' % evaluatorRdd.count())
         # index the pair instead of tuple-unpacking in the lambda
         # signature (a Python-2-only construct, removed by PEP 3113)
         rddOK = evaluatorRdd.filter(lambda ab: ab[0] == ab[1])
         nbOK = rddOK.count()
         nbTOT = testSet.count()
         prec = nbOK / float(nbTOT)
         precMin = min(precMin, prec)
         precMax = max(precMax, prec)
         listPrec.append(prec)
         print('iteration precision : %f' % prec)
     return precMin, precMax, mean(listPrec)
 def doIt(self):
     '''Parse the Reuters corpus RDD and return the non-empty results.'''
     # bind to a local so the Spark closure does not capture self
     targets = self.lookingList
     parsed = self.filenameRdd.flatMap(lambda line: _fct(targets, line))
     newsRdd = parsed.filter(lambda item: item != [])
     MessageManager.debugMessage(
         "ReutersNewsSourceHDFS : stop reading Reuters corpus")
     return newsRdd
    def lookingAt(self, symbole, startDate, endDate, keywords):
        '''
        Build an RDD of News whose head or body contains one of the
        keywords (case-insensitive) and whose date lies in
        [startDate, endDate].
        '''
        upperKeywords = [x.upper() for x in keywords]
        MessageManager.debugMessage(
            "ReutersNewsSourceHDFS : start reading Reuters corpus")

        def hasAnyofTheresKeywords(keywords, text):
            # True as soon as one keyword appears in text
            for word in keywords:
                if (word in text):
                    return True
            return False

        def fct(line):
            # Parse one "date,head,msg..." CSV line -> News or None.
            try:
                lines = line.split(',')
                date = datetime.datetime.strptime(lines[0],
                                                  "%Y-%m-%d %H:%M:%S")
                if (date >= startDate and date <= endDate):
                    head = lines[1]
                    msg = ''.join(lines[2:])
                    if (hasAnyofTheresKeywords(upperKeywords, head.upper())
                            or hasAnyofTheresKeywords(upperKeywords,
                                                      msg.upper())):
                        return News(pubDate=date,
                                    symbole=symbole,
                                    publication=head + msg,
                                    pubSource="Reuters")
            # narrowed from a bare except so real bugs are not hidden
            except (ValueError, IndexError):
                pass  # explicative line or empty
            return None

        newsRdd = self.filenameRdd.map(fct).filter(lambda x: x is not None)
        MessageManager.debugMessage(
            "ReutersNewsSourceHDFS : stop reading Reuters corpus")
        return newsRdd
 def lookingAt(self, symbole, startDate, endDate, keywords):
     '''
     Return an RDD of News matching the keywords (case-insensitive)
     and dated within [startDate, endDate].
     '''
     upperKeywords = [x.upper() for x in keywords]
     MessageManager.debugMessage("ReutersNewsSourceHDFS : start reading Reuters corpus")

     def hasAnyofTheresKeywords(keywords, text):
         # True as soon as one keyword appears in text
         return any(word in text for word in keywords)

     def fct(line):
         # one "date,head,msg..." CSV line -> News or None
         try:
             lines = line.split(',')
             date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
             if(date >= startDate and date <= endDate):
                 head = lines[1]
                 msg = ''.join(lines[2:])
                 if(hasAnyofTheresKeywords(upperKeywords, head.upper()) or hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                     return News(pubDate=date, symbole=symbole, publication=head+msg, pubSource="Reuters")
         # narrowed from a bare except so real bugs are not hidden
         except (ValueError, IndexError):
             pass # explicative line or empty
         return None

     newsRdd = self.filenameRdd.map(fct).filter(lambda x: x is not None)
     MessageManager.debugMessage("ReutersNewsSourceHDFS : stop reading Reuters corpus")
     return newsRdd
 def requestForMarkets(self, symbole):
     '''Fetch up to 30 daily CSV quotes for *symbole*; return the raw lines.'''
     query = {
         'q': symbole,
         'startdate': '2000-01-01',
         'enddate': time.strftime('%Y-%m-%d'),
         'num': 30,
         'output': 'csv',
     }
     response = requests.get(self.url, params=query)
     MessageManager.debugMessage("GoogleFinanceMarketSource : request")
     return response.text.encode('utf-8').split('\n')
 def addMarketStatusToNews(self, news):
     '''
     Attach to each News the (at most 3 earliest) MarketStatus quotes
     dated within 7 days after the news publication date.
     '''
     for new in news:
         self.addIfNotExist(new.symbole)
         enddate = new.pubDate + datetime.timedelta(days=7)
         isFirstLline = True
         new.marketStatus = []
         for line in self.symboles[new.symbole]:
             if (isFirstLline):
                 # skip the CSV header line
                 isFirstLline = False
             else:
                 try:
                     date_m, open_m, high_m, low_m, close_m, volume_m = line.split(',')
                     date_m = datetime.datetime.strptime(date_m, "%d-%b-%y")
                     if (date_m >= new.pubDate and date_m <= enddate):
                         MessageManager.debugMessage(
                             "GoogleFinanceMarketSource : add marketStatus")
                         for machin in [date_m, open_m, high_m, low_m,
                                        close_m, volume_m]:
                             MessageManager.debugMessage(str(machin))
                         new.marketStatus.append(
                             MarketStatus(date_m, open_m, high_m, low_m,
                                          close_m, volume_m))
                         MessageManager.debugMessage(
                             "GoogleFinanceMarketSource : marketStatus added")
                 # ValueError covers both a wrong field count in the
                 # unpack and a bad date in strptime (was a bare except)
                 except ValueError:
                     MessageManager.debugMessage(
                         "GoogleFinanceMarketSource : exception")
         # keep only the three quotes closest to the publication date
         new.marketStatus = sorted(new.marketStatus,
                                   key=lambda x: x.market_date)[:3]
 def lookingAt(self, symbole, startDate, endDate, keywords):
     '''
     Scan the Reuters corpus file and append to self.news every News
     whose head or body contains a keyword (case-insensitive) and whose
     date lies in [startDate, endDate].
     '''
     upperKeywords = [x.upper() for x in keywords]
     MessageManager.debugMessage(
         "ReutersNewsSource : start reading Reuters corpus")
     # with-statement guarantees the file is closed even if an
     # unexpected error escapes the loop (original used open/close)
     with open(self.filename, 'r') as f:
         for line in f:
             try:
                 lines = line.split(',')
                 date = datetime.datetime.strptime(lines[0],
                                                   "%Y-%m-%d %H:%M:%S")
                 if (date >= startDate and date <= endDate):
                     head = lines[1]
                     msg = ''.join(lines[2:])
                     if (self.hasAnyofTheresKeywords(upperKeywords,
                                                     head.upper())
                             or self.hasAnyofTheresKeywords(
                                 upperKeywords, msg.upper())):
                         MessageManager.debugMessage(
                             "ReutersNewsSource : head or msg has keywords")
                         self.news.append(
                             News(pubDate=date,
                                  symbole=symbole,
                                  publication=head,
                                  pubSource="Reuters"))
             # narrowed from a bare except so real bugs are not hidden
             except (ValueError, IndexError):
                 pass  # explicative line or empty
     MessageManager.debugMessage(
         "ReutersNewsSource : stop reading Reuters corpus")
     MessageManager.debugMessage("ReutersNewsSource : %d news found" %
                                 len(self.news))
예제 #8
0
def classification(filepath='/media/droz/KIKOOLOL HDD/Corpus/dataset/dataset.txt', sc=None):
    '''
    Train a DataClassifier on 80% of the dataset file and print the
    precision measured on the remaining 20%.

    filepath -- text file with one literal "(data, label)" tuple per line
    sc       -- an initialized SparkContext
    '''
    MessageManager.debugMessage("classification : start open file %s" % filepath)
    lines = sc.textFile(filepath)
    # index the (data, label) tuple instead of unpacking it in the
    # lambda signature (a Python-2-only construct, removed by PEP 3113)
    fullDataSet = lines.map(lambda line: literal_eval(line)).map(
        lambda dl: LabeledPoint((1 if dl[1] else 0), dl[0])).cache()
    MessageManager.debugMessage("classification : start split dataset")
    trainRdd, testRdd = fullDataSet.randomSplit([80, 20], 17)
    dc = DataClassifier()
    MessageManager.debugMessage("classification : start training")
    dc.train(trainRdd)
    MessageManager.debugMessage("classification : stop training")
    MessageManager.debugMessage("classification : start prediction")
    evaluatorRdd = testRdd.map(lambda p: (p.label, dc.predict(p.features)))
    nbOK = evaluatorRdd.filter(lambda ab: ab[0] == ab[1]).count()
    nbTOT = testRdd.count()
    precision = nbOK / float(nbTOT)
    print('precision : %f' % precision)
예제 #9
0
    def crossvalidation(self, nbSplits=5):
        '''
        Cross validate the algorithmes for show/select the best

        return best algorithme name and mean precision
        '''
        dicoPrec = {}
        for (classifier, name) in self.classifier:
            dicoPrec[name] = {'min': 1.0, 'max': 0.0, 'mean': []}
        # equal weights -> folds of (roughly) equal size
        aSplit = [1] * nbSplits
        rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
        matrixList = []
        for cpt in range(0, nbSplits):
            MessageManager.debugMessage(
                "DataClassifierEvaluator : start new cross-validation iteration %d/%d"
                % (cpt + 1, nbSplits))
            trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
            print('trainset size : %d' % trainSet.count())
            print('testset size : %d' % testSet.count())
            for (classifier, name) in self.classifier:
                currentModel = DataClassifier(None, classifier)
                currentModel.train(trainSet)

                # mapPartitions accepts any iterable return value, so
                # return the generator directly -- the original yielded
                # a generator and needed an extra flatMap to unwrap it.
                # Binding the model as a default argument avoids the
                # late-binding-closure pitfall across loop iterations.
                def f(iterator, model=currentModel):
                    return ((p.label, model.predict(p.features))
                            for p in iterator)

                evaluatorRdd = testSet.mapPartitions(f)
                matrix = self._createConfusionMatrix(evaluatorRdd)
                matrixList.append(matrix)
                self._showMatrix(matrix)
                prec = self._results(evaluatorRdd)
                dicoPrec[name]['min'] = min(dicoPrec[name]['min'], prec)
                dicoPrec[name]['max'] = max(dicoPrec[name]['max'], prec)
                dicoPrec[name]['mean'].append(prec)
            print('=== Result of iteration ===')
            self.showResultConsole(dicoPrec)
        print('+++=== mean confusion matrix ===+++')
        self.matrixConfusion = self._meanMatrix(matrixList)
        self._showMatrix(self.matrixConfusion)
        print('+++=== Final Result ===+++')
        return self.showResultConsole(dicoPrec)
 def lookingAt(self, symbole, startDate, endDate, keywords):
     '''
     Search Google Finance news for *symbole* between startDate and
     endDate and append every quote found to self.news.

     Pages through results self.num at a time until a short page is
     returned. (The keywords argument is kept for interface parity
     with the other sources; this backend does not filter on it.)
     '''
     hasMoreQuote = True
     params = {
         'q': symbole,
         'startdate': str(startDate.strftime('%Y-%m-%d')),
         'enddate': str(endDate.strftime('%Y-%m-%d')),
         'start': 0,
         'num': self.num
     }
     while (hasMoreQuote):
         r = requests.get(self.url, params=params)
         MessageManager.debugMessage("GoogleFinanceNewsSource : request")
         text = self.h.unescape(r.text).encode('utf-8')
         quotes = re.findall(self.expNews, text)
         dates = re.findall(self.expDate, text)
         sources = re.findall(self.expPubSource, text)
         if (len(quotes) < self.num):
             # a short page means this was the last one
             hasMoreQuote = False
         # range instead of xrange: Python-3 compatible, same iteration
         for cpt in range(len(quotes)):
             try:
                 # dates look like "Feb 26, 2015"
                 date = datetime.datetime.strptime(dates[cpt], "%b %d, %Y")
                 self.news.append(
                     News(pubDate=date,
                          symbole=symbole,
                          publication=quotes[cpt],
                          pubSource=sources[cpt]))
             # unparsable or relative date ("recent") -> stamp with now
             except (ValueError, IndexError):
                 MessageManager.debugMessage(
                     "new recent, ... set for today")
                 self.news.append(
                     News(pubDate=datetime.datetime.now(),
                          symbole=symbole,
                          publication=quotes[cpt],
                          pubSource=sources[cpt]))
         params['start'] += self.num
예제 #11
0
 def crossvalidation(self, nbSplits=5):
     '''
     Cross validate the algorithmes for show/select the best

     return best algorithme name and mean precision
     '''
     dicoPrec = {}
     for (classifier, name) in self.classifier:
         dicoPrec[name] = {'min': 1.0, 'max': 0.0, 'mean': []}
     # equal weights -> folds of (roughly) equal size
     aSplit = [1] * nbSplits
     rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
     matrixList = []
     for cpt in range(0, nbSplits):
         MessageManager.debugMessage("DataClassifierEvaluator : start new cross-validation iteration %d/%d" % (cpt+1, nbSplits))
         trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
         print('trainset size : %d' % trainSet.count())
         print('testset size : %d' % testSet.count())
         for (classifier, name) in self.classifier:
             currentModel = DataClassifier(None, classifier)
             currentModel.train(trainSet)
             # mapPartitions accepts any iterable return value, so return
             # the generator directly -- the original yielded a generator
             # and needed an extra flatMap to unwrap it. Binding the model
             # as a default argument avoids late-binding across iterations.
             def f(iterator, model=currentModel):
                 return ((p.label, model.predict(p.features)) for p in iterator)
             evaluatorRdd = testSet.mapPartitions(f)
             matrix = self._createConfusionMatrix(evaluatorRdd)
             matrixList.append(matrix)
             self._showMatrix(matrix)
             prec = self._results(evaluatorRdd)
             dicoPrec[name]['min'] = min(dicoPrec[name]['min'], prec)
             dicoPrec[name]['max'] = max(dicoPrec[name]['max'], prec)
             dicoPrec[name]['mean'].append(prec)
         print('=== Result of iteration ===')
         self.showResultConsole(dicoPrec)
     print('+++=== mean confusion matrix ===+++')
     self.matrixConfusion = self._meanMatrix(matrixList)
     self._showMatrix(self.matrixConfusion)
     print('+++=== Final Result ===+++')
     return self.showResultConsole(dicoPrec)
 def lookingAt(self, symbole, startDate, endDate, keywords):
     '''
     Page through Google Finance news for *symbole* between startDate
     and endDate, appending every quote found to self.news.
     '''
     hasMoreQuote = True
     params = {'q': symbole, 'startdate': str(startDate.strftime('%Y-%m-%d')), 'enddate': str(endDate.strftime('%Y-%m-%d')), 'start': 0, 'num': self.num}
     while(hasMoreQuote):
         r = requests.get(self.url, params=params)
         MessageManager.debugMessage("GoogleFinanceNewsSource : request")
         text = self.h.unescape(r.text).encode('utf-8')
         quotes = re.findall(self.expNews, text)
         dates = re.findall(self.expDate, text)
         sources = re.findall(self.expPubSource, text)
         if(len(quotes) < self.num):
             # a short page means this was the last one
             hasMoreQuote = False
         # range instead of xrange: Python-3 compatible, same iteration
         for cpt in range(len(quotes)):
             try:
                 # dates look like "Feb 26, 2015"
                 date = datetime.datetime.strptime(dates[cpt], "%b %d, %Y")
                 self.news.append(News(pubDate=date, symbole=symbole, publication=quotes[cpt], pubSource=sources[cpt]))
             # unparsable or relative date ("recent") -> stamp with now
             except (ValueError, IndexError):
                 MessageManager.debugMessage("new recent, ... set for today")
                 self.news.append(News(pubDate=datetime.datetime.now(), symbole=symbole, publication=quotes[cpt], pubSource=sources[cpt]))
         params['start'] += self.num
예제 #13
0
 def crossvalidation(self, nbSplits=5):
     '''
     Run nbSplits-fold cross-validation on self.dataset.

     return precision min, precision max, moyenne des precisions
     '''
     precMin = 1.0
     precMax = 0.0
     listPrec = []
     # equal weights -> folds of (roughly) equal size
     aSplit = [1] * nbSplits
     rdds = self.dataset.randomSplit(aSplit, random.randint(1, 100000))
     for cpt in range(0, nbSplits):
         # cpt + 1 so the log reads 1/5 .. 5/5 rather than 0/5 .. 4/5
         MessageManager.debugMessage(
             "DataClassifier : start new cross-validation iteration %d/%d" %
             (cpt + 1, nbSplits))
         trainSet, testSet = self._giveTrainAndtest(rdds, cpt)
         print('trainset size : %d' % trainSet.count())
         print('testset size : %d' % testSet.count())
         # cheat because we can't use self.predict in a map here...
         toto = DataClassifier(None, self.classifier)
         toto.train(trainSet)
         evaluatorRdd = testSet.map(lambda p:
                                    (p.label, toto.predict(p.features)))
         print('evaluatorRdd size : %d' % evaluatorRdd.count())
         # index the pair instead of tuple-unpacking in the lambda
         # signature (a Python-2-only construct, removed by PEP 3113)
         rddOK = evaluatorRdd.filter(lambda ab: ab[0] == ab[1])
         nbOK = rddOK.count()
         nbTOT = testSet.count()
         prec = nbOK / float(nbTOT)
         precMin = min(precMin, prec)
         precMax = max(precMax, prec)
         listPrec.append(prec)
         print('iteration precision : %f' % prec)
     return precMin, precMax, mean(listPrec)
예제 #14
0
 def lookingAt(self, symbole, startDate, endDate, keywords):
     '''
     Scan the Reuters corpus file and append to self.news every News
     whose head or body contains a keyword and whose date lies in
     [startDate, endDate].
     '''
     upperKeywords = [x.upper() for x in keywords]
     MessageManager.debugMessage("ReutersNewsSource : start reading Reuters corpus")
     # with-statement guarantees the file is closed even if an
     # unexpected error escapes the loop (original used open/close)
     with open(self.filename, 'r') as f:
         for line in f:
             try:
                 lines = line.split(',')
                 date = datetime.datetime.strptime(lines[0], "%Y-%m-%d %H:%M:%S")
                 if(date >= startDate and date <= endDate):
                     head = lines[1]
                     msg = ''.join(lines[2:])
                     if(self.hasAnyofTheresKeywords(upperKeywords, head.upper()) or self.hasAnyofTheresKeywords(upperKeywords, msg.upper())):
                         MessageManager.debugMessage("ReutersNewsSource : head or msg has keywords")
                         self.news.append(News(pubDate=date, symbole=symbole, publication=head, pubSource="Reuters"))
             # narrowed from a bare except so real bugs are not hidden
             except (ValueError, IndexError):
                 pass # explicative line or empty
     MessageManager.debugMessage("ReutersNewsSource : stop reading Reuters corpus")
     MessageManager.debugMessage("ReutersNewsSource : %d news found" % len(self.news))
예제 #15
0
def useDataClassifier(filepath='/media/droz/KIKOOLOL HDD/Corpus/dataset/dataset.txt', sc=None):
    '''
    Cross-validate an SVM classifier on the dataset file, then train it
    on the full dataset and persist the model.

    filepath -- text file with one literal "(data, label)" tuple per line
    sc       -- an initialized SparkContext
    '''
    MessageManager.debugMessage("useDataClassifier : start open file %s" % filepath)
    lines = sc.textFile(filepath)
    # index the (data, label) tuple instead of unpacking it in the
    # lambda signature (a Python-2-only construct, removed by PEP 3113)
    fullDataSet = lines.map(lambda line: literal_eval(line)).map(
        lambda dl: LabeledPoint((1 if dl[1] else 0), dl[0]))
    fullDataSet.cache()
    dc = DataClassifier(fullDataSet, SVMWithSGD)
    MessageManager.debugMessage("useDataClassifier : start crossvalidation")
    precMin, precMax, prec = dc.crossvalidation(5)

    MessageManager.debugMessage("useDataClassifier : train full dataset")
    dc.train(fullDataSet)
    dc.saveModel()
    print('min : %f, max : %f, mean : %f' % (precMin, precMax, prec))
예제 #16
0
 def selectBestModel(self):
     '''
     Cross-validate every registered classifier and return the one with
     the highest mean precision, together with its name.
     '''
     bestClassifier, nameBest, bestPrec = None, '', 0.0
     for (classifier, name) in self.classifier:
         MessageManager.debugMessage('DataClassifierEvaluator : Start evaluation of %s' % name)
         evaluator = DataClassifier(self.dataset, classifier)
         precMin, precMax, precMean = evaluator.crossvalidation()
         MessageManager.debugMessage('DataClassifierEvaluator : Results for %s : \n\tPrecMin : %f\n\tPrecMax : %f\n\tPrecMean : %f' % (name, precMin, precMax, precMean))
         if precMean > bestPrec:
             bestClassifier, nameBest, bestPrec = classifier, name, precMean
     MessageManager.debugMessage('DataClassifierEvaluator : best classifier is %s with precision of %f' % (nameBest, bestPrec))
     return bestClassifier, nameBest
 def addMarketStatusToNews(self, news):
     '''
     Attach to each News the (at most 3 earliest) MarketStatus quotes
     dated within 7 days after the news publication date.
     '''
     for new in news:
         self.addIfNotExist(new.symbole)
         enddate = new.pubDate + datetime.timedelta(days=7)
         isFirstLline = True
         new.marketStatus = []
         for line in self.symboles[new.symbole]:
             if(isFirstLline):
                 # skip the CSV header line
                 isFirstLline = False
             else:
                 try:
                     date_m, open_m, high_m, low_m, close_m, volume_m = line.split(',')
                     date_m = datetime.datetime.strptime(date_m, "%d-%b-%y")
                     if(date_m >= new.pubDate and date_m <= enddate):
                         MessageManager.debugMessage("GoogleFinanceMarketSource : add marketStatus")
                         for machin in [date_m, open_m, high_m, low_m, close_m, volume_m]:
                             MessageManager.debugMessage(str(machin))
                         new.marketStatus.append(MarketStatus(date_m, open_m, high_m, low_m, close_m, volume_m))
                         MessageManager.debugMessage("GoogleFinanceMarketSource : marketStatus added")
                 # ValueError covers both a wrong field count in the
                 # unpack and a bad date in strptime (was a bare except)
                 except ValueError:
                     MessageManager.debugMessage("GoogleFinanceMarketSource : exception")
         # keep only the three quotes closest to the publication date
         new.marketStatus = sorted(new.marketStatus, key=lambda x: x.market_date)[:3]
예제 #18
0
 def selectBestModel(self):
     '''
     Evaluate every registered (classifier, name) pair by
     cross-validation and return the pair with the best mean precision.
     '''
     best = (None, '')
     bestMean = 0.0
     for (classifier, name) in self.classifier:
         MessageManager.debugMessage(
             'DataClassifierEvaluator : Start evaluation of %s' % name)
         dc = DataClassifier(self.dataset, classifier)
         precMin, precMax, precMean = dc.crossvalidation()
         MessageManager.debugMessage(
             'DataClassifierEvaluator : Results for %s : \n\tPrecMin : %f\n\tPrecMax : %f\n\tPrecMean : %f'
             % (name, precMin, precMax, precMean))
         if precMean > bestMean:
             bestMean = precMean
             best = (classifier, name)
     MessageManager.debugMessage(
         'DataClassifierEvaluator : best classifier is %s with precision of %f'
         % (best[1], bestMean))
     return best
예제 #19
0
 def loadModel(self):
     '''Unpickle the model stored at self.modelpath into self.model.'''
     MessageManager.debugMessage("DataClassifier : Load Model")
     # with-statement closes the file handle (the original leaked it)
     with open(self.modelpath, 'rb') as f:
         self.model = pickle.load(f)
예제 #20
0
         
    return News(pubDate=date, symbole='NASDAQ:GOOGL', publication=txt, pubSource='Reuteurs', marketStatus=[m1,m2,m3])

# Smoke-test driver: build a random news corpus, train a weighted
# ensemble of three linear classifiers, and report cross-validated
# precision.
if __name__ == "__main__":
    # 30 synthetic news items (createRandomNews is defined elsewhere in
    # this file)
    allNews = [createRandomNews() for x in range(30)]
    sc = SparkContext()
    newsRDD = sc.parallelize(allNews).distinct()
    dataSetMaker = DataSetMakerV2()
    fullDataSet = dataSetMaker.process(newsRDD)
    fullDataSet.cache()
    # three classifiers at weight 0.3 each (sums to 0.9 -- presumably
    # the wrapper normalizes the weights; TODO confirm)
    myClassifier = ClassifiersWrapper()
    myClassifier.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.3)
    dc = DataClassifier(fullDataSet, myClassifier)
    MessageManager.debugMessage("main : start crossvalidation")
    # 5-fold cross-validation -> (min, max, mean) precision
    precMin, precMax, prec = dc.crossvalidation(5)
    print('min : %f, max : %f, mean : %f' % (precMin, precMax, prec))
    # The triple-quoted block below is commented-out exploratory code
    # kept as a no-op string literal.
    '''
    featuresRDD = newsRDD.map(lambda x: FeaturesV2(x))
    allBg2 = featuresRDD.map(lambda x: list(x.bg2)).reduce(lambda a,b : a+b)
    allBg3 = featuresRDD.map(lambda x: list(x.bg3)).reduce(lambda a,b : a+b)
    setAllBg2 = set(allBg2)
    setAllBg3 = set(allBg3)
    print('size of setAllBg2 : %d' % len(setAllBg2))
    print('size of setAllBg3 : %d' % len(setAllBg3))
    
    allBg2Flat = featuresRDD.flatMap(lambda x: list(x.bg2))
    allBg2FlatUnique = allBg2Flat.intersection(allBg2Flat).collect()
    print('size of allBg2FlatUnique %d' % len(allBg2FlatUnique))    
    '''
예제 #21
0
 def doIt(self):
     '''Parse the Reuters corpus RDD and return the non-empty results.'''
     # bind to a local so the Spark closure does not capture self
     targets = self.lookingList
     newsRdd = (self.filenameRdd
                .flatMap(lambda line: _fct(targets, line))
                .filter(lambda res: res != []))
     MessageManager.debugMessage("ReutersNewsSourceHDFS : stop reading Reuters corpus")
     return newsRdd
예제 #22
0
         
    return News(pubDate=date, symbole='NASDAQ:GOOGL', publication=txt, pubSource='Reuteurs', marketStatus=[m1,m2,m3])

# Smoke-test driver: build a random news corpus, train a weighted
# ensemble of three linear classifiers, and report cross-validated
# precision.
if __name__ == "__main__":
    # 30 synthetic news items (createRandomNews is defined elsewhere in
    # this file)
    allNews = [createRandomNews() for x in range(30)]
    sc = SparkContext()
    newsRDD = sc.parallelize(allNews).distinct()
    dataSetMaker = DataSetMakerV2()
    fullDataSet = dataSetMaker.process(newsRDD)
    fullDataSet.cache()
    # three classifiers at weight 0.3 each (sums to 0.9 -- presumably
    # the wrapper normalizes the weights; TODO confirm)
    myClassifier = ClassifiersWrapper()
    myClassifier.addClassifier(classifier=SVMWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS, trainParameters={}, weight=0.3)
    dc = DataClassifier(fullDataSet, myClassifier)
    MessageManager.debugMessage("main : start crossvalidation")
    # 5-fold cross-validation -> (min, max, mean) precision
    precMin, precMax, prec = dc.crossvalidation(5)
    print('min : %f, max : %f, mean : %f' % (precMin, precMax, prec))
    # The triple-quoted block below is commented-out exploratory code
    # kept as a no-op string literal.
    '''
    featuresRDD = newsRDD.map(lambda x: FeaturesV2(x))
    allBg2 = featuresRDD.map(lambda x: list(x.bg2)).reduce(lambda a,b : a+b)
    allBg3 = featuresRDD.map(lambda x: list(x.bg3)).reduce(lambda a,b : a+b)
    setAllBg2 = set(allBg2)
    setAllBg3 = set(allBg3)
    print('size of setAllBg2 : %d' % len(setAllBg2))
    print('size of setAllBg3 : %d' % len(setAllBg3))
    
    allBg2Flat = featuresRDD.flatMap(lambda x: list(x.bg2))
    allBg2FlatUnique = allBg2Flat.intersection(allBg2Flat).collect()
    print('size of allBg2FlatUnique %d' % len(allBg2FlatUnique))    
    '''
예제 #23
0
 def loadModel(self):
     '''Unpickle the model stored at self.modelpath into self.model.'''
     MessageManager.debugMessage("DataClassifier : Load Model")
     # with-statement closes the file handle (the original leaked it)
     with open(self.modelpath, 'rb') as f:
         self.model = pickle.load(f)
예제 #24
0
 def saveModel(self):
     '''Pickle self.model to self.modelpath.'''
     MessageManager.debugMessage("DataClassifier : Save Model")
     # with-statement flushes and closes the handle (original leaked it)
     with open(self.modelpath, 'wb') as f:
         pickle.dump(self.model, f)
 def requestForMarkets(self, symbole):
     '''Download up to 30 daily CSV quotes for *symbole*; return raw lines.'''
     query = {
         'q': symbole,
         'startdate': '2000-01-01',
         'enddate': time.strftime('%Y-%m-%d'),
         'num': 30,
         'output': 'csv',
     }
     r = requests.get(self.url, params=query)
     MessageManager.debugMessage("GoogleFinanceMarketSource : request")
     return r.text.encode('utf-8').split('\n')
예제 #26
0
 def saveModel(self):
     '''Pickle self.model to self.modelpath.'''
     MessageManager.debugMessage("DataClassifier : Save Model")
     # with-statement flushes and closes the handle (original leaked it)
     with open(self.modelpath, 'wb') as f:
         pickle.dump(self.model, f)