コード例 #1
0
ファイル: WeChatBill.py プロジェクト: Sijiu/Xbill
    def to_xbill(self) -> XBill:
        """Convert this WeChat bill record into a unified XBill.

        Maps the Chinese fund-status text onto a BillStatus, joins the
        non-placeholder remark fields with ';', and lets the Classifier
        assign status/category/subcategory before returning the record.
        """

        def unify_status(fund_status):
            # '支出' -> payout, '收入' -> income; anything else is treated
            # as a transfer between the user's own accounts.
            if '支出' in fund_status:
                return BillStatus.PAYOUT
            if '收入' in fund_status:
                return BillStatus.INCOME
            return BillStatus.INTERNAL_TRANS

        def format_remarks(*parts) -> str:
            # '/' is the exporter's placeholder for an empty field; drop it.
            meaningful = []
            for part in parts:
                if part != '/':
                    meaningful.append(part)
            return ';'.join(meaningful)

        bill = XBill()
        bill.account = self.account
        bill.amount = self.amount
        bill.currency = '人民币'
        bill.trans_time = self.trans_time
        bill.status = unify_status(self.fund_status)
        bill.trader_name = self.trader_name
        bill.product_name = self.product_name
        bill.remarks = format_remarks(self.pay_type, self.trans_type,
                                      self.trans_status, self.remarks)
        bill.associate_id = -1

        # The classifier may refine the status and fills in the categories.
        bill.status, bill.category, bill.subcategory = Classifier().classify(bill)
        return bill
コード例 #2
0
    def to_xbill(self) -> XBill:
        """Convert this bill record into a unified XBill.

        The amount includes the service fee; the fund-status text is mapped
        onto a BillStatus (unknown values pass through unchanged) and the
        Classifier assigns status/category/subcategory.
        """

        def unify_status(fund_status):
            # Map the Chinese fund-status text onto a BillStatus; an
            # unrecognised value is returned as-is.
            if '支出' in fund_status:
                return BillStatus.PAYOUT
            if '收入' in fund_status:
                return BillStatus.INCOME
            if '资金转移' in fund_status:
                return BillStatus.INTERNAL_TRANS
            return fund_status

        record = XBill()
        record.account = self.account
        # Total charged = base amount plus the service fee.
        record.amount = self.amount + self.service_fee
        record.currency = '人民币'
        record.trans_time = self.create_time
        record.status = unify_status(self.fund_status)
        record.trader_name = self.trader_name
        record.product_name = self.product_name
        record.remarks = self.source + ';' + self.remarks
        record.associate_id = -1

        record.status, record.category, record.subcategory = (
            Classifier().classify(record))
        return record
コード例 #3
0
    def Train(self, _Classifier, trainFeaturesExtractor, Full):
        """Build ``self.classifier`` for the given kind and optionally train it.

        :param _Classifier: classifier kind name (e.g. "Lexicon", "DT",
            "SVM"); selects the Configurations-<name>.xml file.
        :param trainFeaturesExtractor: extractor whose ``.features`` and
            ``.labels`` feed the classifier; test features/labels stay empty.
        :param Full: when truthy, run the full training pass; otherwise only
            construct the classifier (e.g. for cross-validation scoring).
        """
        # Configuration XML for this classifier kind.
        configFileClassifier = os.path.join(
            self.basePath, "Classifier", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        # Serialization target for the trained model.
        modelSerializationFile = os.path.join(self.basePath, "Classifier",
                                              "Output", "classifier_model.bin")

        # Start the Classifier:
        #---------------------

        self.classifier = Classifier(configFileClassifier,
                                     modelSerializationFile,
                                     trainFeaturesExtractor.features,
                                     trainFeaturesExtractor.labels, [], [])

        # Idiomatic truthiness test instead of `Full == True`.
        if Full:
            self.classifier.Train()
コード例 #4
0
 def to_xbill(self):
     """Copy this record's fields into an XBill and run the Classifier.

     Note: the record's summary becomes the XBill product name and the
     original product name is kept in the remarks field.
     """
     bill = XBill()
     bill.account = self.account
     bill.amount = self.amount
     bill.currency = self.currency
     bill.trans_time = self.trans_time
     bill.status = self.status
     bill.trader_name = self.trader_name
     bill.product_name = self.summary
     bill.remarks = self.product_name
     bill.associate_id = -1
     # The classifier may refine the status and fills in the categories.
     bill.status, bill.category, bill.subcategory = Classifier().classify(bill)
     return bill
コード例 #5
0
        if LOAD_LABELS:
            testFeaturesExtractor.LoadLabels()

    if DUMP_FEATURES:
        trainFeaturesExtractor.DumpFeaturesToTxt(trainExportFileName)
        testFeaturesExtractor.DumpFeaturesToTxt(testExportFileName)
if CLASSIFIER:
    # The serialization file to save the trained model. Backslashes are
    # doubled throughout: the original "\c" is an invalid escape sequence
    # that Python only preserves with a (Syntax)Warning; the runtime value
    # is unchanged by this fix.
    modelSerializationFile = ".\\Classifier\\Output\\classifier_model.bin"

    # Start the Classifier:
    #---------------------
    if SVM_CLASSIFIER:
        classifier = Classifier(modelSerializationFile, 'SVM',
                                trainFeaturesExtractor.features,
                                trainFeaturesExtractor.labels,
                                testFeaturesExtractor.features,
                                testFeaturesExtractor.labels)

        if not LOAD_MODEL:
            # Train from scratch and persist the model.
            classifier.Train()
            classifier.SaveModel()
        else:
            classifier.LoadModel()
        # Evaluate on the held-out features.
        labels, acc, val = classifier.Test()

        # Build the confusion matrix from true vs. predicted labels.
        mConfusionMatrix, mNormalConfusionMatrix, vNumTrainExamplesPerClass, vAccuracyPerClass, nOverallAccuracy = classifier.BuildConfusionMatrix(
            testFeaturesExtractor.labels, labels)
コード例 #6
0
    def __init__(self):
        '''
        Constructor: build the train set, the lexicon and Tasi language
        models, extract numeric TF features for each, and train one
        classifier per feature set.
        :type self:
        '''
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder','Configurations','Configurations.xml')

        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder','Output', 'dataset.bin')

        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder','Input','train')


        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)

        # The train set is always (re)loaded from the XLSX input.
        datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)


        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
        configFileLanguageModel_Tasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model.
        # NOTE(review): both language models below share this one file, so
        # the second build may overwrite the first — confirm intended.
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # Start the LanguageModel:

        # Initialize the LanguageModel_Lexicon
        self.languageModel_lexicon = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_lexicon.BuildLanguageModel()

         # Initialize the LanguageModel_Tasi
        self.languageModel_Tasi = LanguageModel(configFileLanguageModel_Tasi, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_Tasi.BuildLanguageModel()

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        configFileFeaturesExtractor_Tasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization file to save the features.
        # NOTE(review): the two extractors below also share these files.
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_lexicon, datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures()

        # Initialize the FeaturesExtractor _ Tasi
        trainFeaturesExtractor_Tasi = FeaturesExtractor(configFileFeaturesExtractor_Tasi, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_Tasi, datasetBuilder.trainSet)
        trainFeaturesExtractor_Tasi.ExtractNumTfFeatures()

        # Classifier configuration files and model serialization target.
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
        configFileClassifier_Tasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Start the Classifier:
        #---------------------
        # Debug output: the first few extracted labels vs. source records.
        print(trainFeaturesExtractor_Tasi.labels[:4])
        print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
        self.classifier_Lexicon = Classifier(configFileClassifier_Lexicon, modelSerializationFile,  trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
        self.classifier_Tasi = Classifier(configFileClassifier_Tasi, modelSerializationFile, trainFeaturesExtractor_Tasi.features,
                        trainFeaturesExtractor_Tasi.labels, [],[])

        # Train both classifiers on their respective feature sets.
        self.classifier_Lexicon.Train()
        self.classifier_Tasi.Train()
コード例 #7
0
class Filter(object):
    '''
    Sentiment filter: trains a lexicon-based and a Tasi classifier on the
    XLSX train set at construction time; Classify() then labels free text
    with whichever model the caller selects by stock name.
    '''

    def __init__(self):
        '''
        Constructor: build the train set, the lexicon and Tasi language
        models, extract numeric TF features for each, and train one
        classifier per feature set.
        :type self:
        '''
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder','Configurations','Configurations.xml')

        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder','Output', 'dataset.bin')

        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder','Input','train')

        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)

        datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)

        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
        configFileLanguageModel_Tasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model.
        # NOTE(review): both language models share this one file.
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # Start the LanguageModel:

        # Initialize the LanguageModel_Lexicon
        self.languageModel_lexicon = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_lexicon.BuildLanguageModel()

        # Initialize the LanguageModel_Tasi
        self.languageModel_Tasi = LanguageModel(configFileLanguageModel_Tasi, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_Tasi.BuildLanguageModel()

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        configFileFeaturesExtractor_Tasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization file to save the features
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_lexicon, datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures()

        # Initialize the FeaturesExtractor _ Tasi
        trainFeaturesExtractor_Tasi = FeaturesExtractor(configFileFeaturesExtractor_Tasi, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_Tasi, datasetBuilder.trainSet)
        trainFeaturesExtractor_Tasi.ExtractNumTfFeatures()

        # Classifier configuration files and model serialization target.
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
        configFileClassifier_Tasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Start the Classifier:
        #---------------------
        # Debug output: the first few extracted labels vs. source records.
        print(trainFeaturesExtractor_Tasi.labels[:4])
        print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
        self.classifier_Lexicon = Classifier(configFileClassifier_Lexicon, modelSerializationFile,  trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
        self.classifier_Tasi = Classifier(configFileClassifier_Tasi, modelSerializationFile, trainFeaturesExtractor_Tasi.features,
                        trainFeaturesExtractor_Tasi.labels, [],[])

        # Train both classifiers.
        self.classifier_Lexicon.Train()
        self.classifier_Tasi.Train()

    def Classify(self, text, stockName):
        """Return predicted labels for each raw string in `text`.

        :param text: iterable of text strings to classify.
        :param stockName: 'Tasi' selects the Tasi model; anything else
            selects the lexicon model.
        """
        testSet = []
        for t in text:
            testSet.append({'label' : '', 'text' : t})

        if stockName == 'Tasi':
            # Configurations file xml of the features extractor
            configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
            testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None, self.languageModel_Tasi, testSet)
            testFeaturesExtractor.ExtractNumTfFeatures()
            self.classifier_Tasi.testFeatures = testFeaturesExtractor.features
            # Dummy targets: Test() requires targets, but only the
            # predicted labels are used by the caller.
            self.classifier_Tasi.testTargets = [1] * len(self.classifier_Tasi.testFeatures)
            label, acc, val = self.classifier_Tasi.Test()
        else:
            configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
            testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None, self.languageModel_lexicon, testSet)
            # BUG FIX: this branch never extracted features and then
            # assigned `testTargets[i] = 1` by index into an empty list
            # (IndexError on any non-empty input). Mirror the Tasi branch:
            # extract the same numeric TF features used at training time
            # (see __init__), then build the dummy targets by length.
            testFeaturesExtractor.ExtractNumTfFeatures()
            self.classifier_Lexicon.testFeatures = testFeaturesExtractor.features
            self.classifier_Lexicon.testTargets = [1] * len(self.classifier_Lexicon.testFeatures)
            label, acc, val = self.classifier_Lexicon.Test()

        return label
コード例 #8
0
class Filter(object):
    '''
    Per-stock sentiment filter.

    Loads the stock->classifier mapping from StockToClassifier.bin, then
    (when Retrain is falsy) builds the train set, language model and
    features for the mapped classifier kind and trains it. Classify()
    labels raw text with the trained model; GetBestClassifier() picks the
    best-scoring kind by cross-validation and records it in the mapping.
    '''
    def __init__(self, basePath, stockName, Retrain):
        '''
        Constructor.

        :param basePath: project root holding the DatasetBuilder /
            LanguageModel / FeaturesExtractor / Classifier directories;
            None falls back to the current directory.
        :param stockName: key into the StockToClassifier mapping.
        :param Retrain: when falsy, train immediately from the XLSX input;
            when truthy, only set up the DatasetBuilder.
        '''
        # BUG FIX: the original read `self.basePath = self.basePath` when
        # basePath was None, which raised AttributeError because the
        # attribute did not exist yet. Fall back to the current directory
        # ('' makes os.path.join produce plain relative paths).
        self.basePath = basePath if basePath is not None else ''

        self.stockName = stockName
        # Use a context manager so the file is closed (original leaked the
        # handle). NOTE(review): pickle.load trusts the file's contents —
        # only load serialization files produced by this project.
        with open(os.path.join(self.basePath, 'StockToClassifier.bin'),
                  'rb') as serializationFile:
            self.StockToClassifier = pickle.load(serializationFile)
        self.usedClassifier = self.StockToClassifier[self.stockName]
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join(self.basePath,
                                                "DatasetBuilder",
                                                "Configurations",
                                                "Configurations.xml")

        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join(self.basePath,
                                                "DatasetBuilder", "Output",
                                                "dataset.bin")

        if not Retrain:
            # The XLSX file name for train set
            xlsxTrainFileName = os.path.join(self.basePath, "DatasetBuilder",
                                             "Input", "train")

            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)

            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
                xlsxTrainFileName)

            # Build the full pipeline for the classifier kind mapped to
            # this stock, then train it.
            self.RunLanguageModel(self.usedClassifier, datasetBuilder.trainSet)

            trainFeaturesExtractor = self.RunFeatureExtractor(
                self.usedClassifier, datasetBuilder.trainSet)
            self.Train(self.usedClassifier, trainFeaturesExtractor, True)
        else:
            # Initialize the DatasetBuilder from serialization file only;
            # no training happens on this path.
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)

    def Classify(self, text):
        """Return predicted labels for each raw string in `text`.

        Requires that __init__ ran with a falsy Retrain (so that
        self.languageModel and self.classifier exist).
        """
        testSet = []
        for t in text:
            testSet.append({'label': '', 'text': t})

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join(
            self.basePath, "FeaturesExtractor", "Configurations",
            "Configurations-" + self.usedClassifier + ".xml")
        testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor,
                                                  None, None,
                                                  self.languageModel, testSet)
        # Match the feature type used at training time.
        if self.usedClassifier == "Lexicon":
            testFeaturesExtractor.ExtractLexiconFeatures()
        else:
            testFeaturesExtractor.ExtractNumTfFeatures()
        self.classifier.testFeatures = testFeaturesExtractor.features
        # Dummy targets: Test() requires targets, but only the predicted
        # labels are used by the caller.
        self.classifier.testTargets = [1] * len(self.classifier.testFeatures)
        label, acc, val = self.classifier.Test()

        return label

    def RunLanguageModel(self, _Classifier, trainSet):
        """Build ``self.languageModel`` for the given classifier kind."""
        # Configurations file xml of the language model
        configFileLanguageModel = os.path.join(
            self.basePath, "LanguageModel", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        stopWordsFileName = os.path.join(self.basePath, "LanguageModel",
                                         "Input", "stop_words.txt")
        linksDBFile = os.path.join(self.basePath, "LanguageModel", "Output",
                                   "links_database.txt")
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join(self.basePath,
                                                      "LanguageModel",
                                                      "Output",
                                                      "language_model.bin")
        if _Classifier == "Lexicon":
            # Lexicon models are additionally seeded from a synonyms file.
            langModelTxtLoadFile = os.path.join(
                self.basePath, "LanguageModel", "Input",
                "language_model_lexicon_synonyms.txt")

        # Start the LanguageModel:

        # Initialize the LanguageModel
        self.languageModel = LanguageModel(configFileLanguageModel,
                                           stopWordsFileName,
                                           languageModelSerializationFile,
                                           linksDBFile, trainSet)
        self.languageModel.BuildLanguageModel()
        if _Classifier == "Lexicon":
            self.languageModel.LoadModelFromTxtFile(langModelTxtLoadFile)

    def RunFeatureExtractor(self, _Classifier, trainSet):
        """Extract train features for the given kind; return the extractor."""
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join(
            self.basePath, "FeaturesExtractor", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        # The serialization file to save the features
        trainFeaturesSerializationFile = os.path.join(self.basePath,
                                                      "FeaturesExtractor",
                                                      "Output",
                                                      "train_features.bin")
        trainLabelsSerializationFile = os.path.join(self.basePath,
                                                    "FeaturesExtractor",
                                                    "Output",
                                                    "train_labels.bin")

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor
        trainFeaturesExtractor = FeaturesExtractor(
            configFileFeaturesExtractor, trainFeaturesSerializationFile,
            trainLabelsSerializationFile, self.languageModel, trainSet)
        # Lexicon uses lexicon features; every other kind uses numeric TF.
        if _Classifier == "Lexicon":
            trainFeaturesExtractor.ExtractLexiconFeatures()
        else:
            trainFeaturesExtractor.ExtractNumTfFeatures()
        return trainFeaturesExtractor

    def Train(self, _Classifier, trainFeaturesExtractor, Full):
        """Build ``self.classifier`` for the given kind; train it when Full.

        :param Full: when truthy, run the full training pass; otherwise
            only construct the classifier (used for cross-validation).
        """
        # Configuration XML for this classifier kind.
        configFileClassifier = os.path.join(
            self.basePath, "Classifier", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        modelSerializationFile = os.path.join(self.basePath, "Classifier",
                                              "Output", "classifier_model.bin")

        # Start the Classifier:
        #---------------------

        self.classifier = Classifier(configFileClassifier,
                                     modelSerializationFile,
                                     trainFeaturesExtractor.features,
                                     trainFeaturesExtractor.labels, [], [])

        # Idiomatic truthiness test instead of `Full == True`.
        if Full:
            self.classifier.Train()

    def GetBestClassifier(self, trainSet):
        """Cross-validate Lexicon/DT/SVM and record the best kind for this stock."""
        self.RunLanguageModel("Lexicon", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("Lexicon", trainSet)
        self.Train("Lexicon", trainFeaturesExtractor, False)
        LexiconAcc = self.classifier.getCrossValidationAccuarcy()

        self.RunLanguageModel("DT", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("DT", trainSet)
        self.Train("DT", trainFeaturesExtractor, False)
        DTAcc = self.classifier.getCrossValidationAccuarcy()

        self.RunLanguageModel("SVM", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("SVM", trainSet)
        self.Train("SVM", trainFeaturesExtractor, False)
        SVMAcc = self.classifier.getCrossValidationAccuarcy()
        # Ties favour Lexicon, then DT (order of the equality checks below).
        bestClassifier = max(LexiconAcc, DTAcc, SVMAcc)
        if bestClassifier == LexiconAcc:
            self.StockToClassifier[self.stockName] = "Lexicon"
        elif bestClassifier == DTAcc:
            self.StockToClassifier[self.stockName] = "DT"
        else:
            self.StockToClassifier[self.stockName] = "SVM"
コード例 #9
0
# Extract collective lexicon-sentiment features for the held-out test set.
testFeatures, testLabels = featuresExtractor.ExtractCollectiveLexiconSentimentFeatures(
    testSet)
'''
# This part is extracting the BoW features scoring for the 3 scores per each word
BoW = featuresExtractor.ConstructBowWithSentiWordNet(dataSet)
'''
'''
trainFeatures, trainLabels = featuresExtractor.ExtractBoWSentiWordNetFeatures(trainSet, BoW)
testFeatures, testLabels = featuresExtractor.ExtractBoWSentiWordNetFeatures(testSet, BoW)
'''
'''
trainFeatures, trainLabels = featuresExtractor.ExtractBoWSentiWordNetCollectiveFeatures(trainSet, BoW)
testFeatures, testLabels = featuresExtractor.ExtractBoWSentiWordNetCollectiveFeatures(testSet, BoW)
'''
# Initialize Classifier
#######################
classifier = Classifier()

# This part is for lexicon classifier. No training required

# Score the lexicon-based classifier on both splits; "training accuracy"
# here is just the lexicon evaluated on the train split.
print('Training accuracy = ' +
      str(classifier.LexiconTest(trainFeatures, trainLabels)))
print('Test accuracy = ' +
      str(classifier.LexiconTest(testFeatures, testLabels)))

# Train and evaluate the neural-network classifier. The return values are
# treated as error counts below: accuracy = (total - errors) / total.
train_err = classifier.NNTrain(trainFeatures, trainLabels, plot=True)
test_err = classifier.NNTest(testFeatures, testLabels, plot=True)
print('Training accuracy = ' +
      str((len(trainFeatures) - train_err) / len(trainFeatures)))
print('Test accuracy = ' +
      str((len(testFeatures) - test_err) / len(testFeatures)))
コード例 #10
0
    def init(cls, save_path, use_backend=True, pre_stocks=None):
        '''
        Build, train and persist one classification model per stock.

        :param save_path: destination passed to ``cls.save`` for each
            trained per-stock model.
        :param use_backend: when True pull each stock's train set from the
            backend; otherwise read it from the XLSX input file.
        :param pre_stocks: optional iterable that replaces the module-level
            ``stocks`` list.
        '''
        global stocks
        if pre_stocks:
            stocks = pre_stocks

        for stock in stocks:
            # ("Buildind" typo is in the original log string.)
            print('Buildind model for %s' % stock)
            stock_model = {}
            # Start the DatasetBuilder
            #-------------------------
            # Configurations file xml of the dataset builder
            configFileDatasetBuilder = os.path.join('DatasetBuilder',
                                                    'Configurations',
                                                    'Configurations.xml')

            # The serialization file to save the dataset
            datasetSerializationFile = os.path.join('DatasetBuilder', 'Output',
                                                    'dataset.bin')

            # The XLSX file name for train set
            xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input',
                                             'train')

            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)
            if use_backend:
                datasetBuilder.trainSet = datasetBuilder.GetDatasetFromBackend(
                    stock)
            else:
                datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
                    xlsxTrainFileName, stock)
            # Skip stocks without enough records to train a usable model.
            if len(datasetBuilder.trainSet) < NMIN_SET:
                print("Not enough data: ", len(datasetBuilder.trainSet))
                continue
            # Drop the first NVALID records — presumably held out for
            # validation elsewhere; confirm against the callers.
            datasetBuilder.trainSet = datasetBuilder.trainSet[NVALID:]
            # Configurations file xml of the language model.
            # NOTE(review): despite the "_lexicon" names below, the Tasi
            # configuration files are loaded — confirm intended.
            configFileLanguageModel_lexicon = os.path.join(
                'LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
            stopWordsFileName = os.path.join('LanguageModel', 'Input',
                                             'stop_words.txt')
            linksDBFile = os.path.join('LanguageModel', 'Output',
                                       'links_database.txt')
            # The serialization file to save the model
            languageModelSerializationFile = os.path.join(
                'LanguageModel', 'Output', 'language_model.bin')

            # Start the LanguageModel:

            # Initialize the LanguageModel_Lexicon
            stock_model['languageModel_lexicon'] = LanguageModel(
                configFileLanguageModel_lexicon, stopWordsFileName,
                languageModelSerializationFile, linksDBFile,
                datasetBuilder.trainSet)
            stock_model['languageModel_lexicon'].BuildLanguageModel()

            # Configurations file xml of the features extractor
            configFileFeaturesExtractor_Lexicon = os.path.join(
                'FeaturesExtractor', 'Configurations',
                'Configurations-Tasi.xml')
            # The serialization file to save the features
            trainFeaturesSerializationFile = os.path.join(
                'FeaturesExtractor', 'Output', 'train_features.bin')
            trainLabelsSerializationFile = os.path.join(
                'FeaturesExtractor', 'Output', 'train_labels.bin')

            # Start the FeaturesExtractor:
            #-----------------------------
            # Initialize the FeaturesExtractor _ Lexicon
            trainFeaturesExtractor_Lexicon = FeaturesExtractor(
                configFileFeaturesExtractor_Lexicon,
                trainFeaturesSerializationFile, trainLabelsSerializationFile,
                stock_model['languageModel_lexicon'], datasetBuilder.trainSet)
            # Sparse extraction: the classifier below consumes
            # .sparse_features rather than .features.
            trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures(sparse=True)
            #print(trainFeaturesExtractor_Lexicon.features[0])
            # Classifier configuration and model serialization target.
            configFileClassifier_Lexicon = os.path.join(
                'Classifier', 'Configurations', 'Configurations-Tasi.xml')
            modelSerializationFile = os.path.join('Classifier', 'Output',
                                                  'classifier_model.bin')

            # Start the Classifier:
            #---------------------
            stock_model['classifier_Lexicon'] = Classifier(
                configFileClassifier_Lexicon, modelSerializationFile,
                trainFeaturesExtractor_Lexicon.sparse_features,
                trainFeaturesExtractor_Lexicon.labels, [], [])
            #stock_model['classifier_Lexicon'] = Classifier(configFileClassifier_Lexicon, modelSerializationFile,  trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
            #print(trainFeaturesExtractor_Lexicon.labels[:4])
            #print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
            # Train the model and persist it for this stock.
            stock_model['classifier_Lexicon'].Train()
            stock_model['training_samples'] = len(datasetBuilder.trainSet)
            cls.save(save_path, stock, stock_model)

            print("----------------------------------------------------")
コード例 #11
0
import argparse
import pandas as pd
import time
import numpy as np
import multiprocessing as mp
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from Classifier.Classifier import Classifier, Feature

# Worker-process counts for the dynamic / static parallel runs
# (consumed by multiprocessing elsewhere in this script).
NO_PARALLELIZED_PROCESSES_DYNAMIC = 5
NO_PARALLELIZED_PROCESSES_STATIC = 5
# Fixed seed so shuffling/splitting (e.g. KFold) is reproducible.
RANDOMIZER_SEED = 1
# Shared module-level classifier instance.
CLF_OBJ = Classifier()
# Data directory; overwritten in main() from the -d/--data argument.
DATA_FILE_PATH = ''


def main():
    start = time.time()

    ## Get Command-line Arguments #################
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', default='../../data', help='')
    opts = parser.parse_args()
    ###############################################
    global DATA_FILE_PATH
    DATA_FILE_PATH = opts.data
    InitializeOutputFile()

    global RANDOMIZER_SEED
コード例 #12
0
import statistics
import time
import multiprocessing as mp

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

from sklearn import model_selection
from scipy.sparse import coo_matrix

from Classifier.Classifier import Classifier
from Helper.DebugPrint import DebugPrint

import matplotlib.pyplot as plt
plt.style.use('ggplot')  # apply the ggplot look to all plots in this script

# Shared module-level classifier instance.
clf = Classifier()

# Fixed seed so shuffling/splitting is reproducible.
RANDOMIZER_SEED = 1

def main():
    start = time.time()

    ## Get Command-line Arguments #################
    parser = argparse.ArgumentParser()
    opts = parser.parse_args()
    ###############################################

    ## Build the Training Set and Testing Set #####
    training_data_dict = LoadDataSet('data/train_data.xlsx')
    testing_data_dict = LoadDataSet('data/test_data.xlsx')
    unlabeled_data_dict = LoadDataSet('data/unlabeled_data.xlsx')
コード例 #13
0
#testFeaturesExtractor.ExtractTFIDFFeatures()
#testFeaturesExtractor.ExtractNumTfFeatures()
#testFeaturesExtractor.ExtractKLFeatures()
# Lexicon features are the active choice; the alternatives above and the
# save/dump calls below are kept commented for experimentation.
testFeaturesExtractor.ExtractLexiconFeatures()
#testFeaturesExtractor.SaveFeatures()
#testFeaturesExtractor.SaveLabels()
#testFeaturesExtractor.DumpFeaturesToTxt(testExportFileName)

# Classifier configuration and model-serialization paths. Backslashes are
# doubled throughout: the original "\c" is an invalid escape sequence that
# Python only preserves with a (Syntax)Warning; the runtime value is
# unchanged by this fix.
configFileClassifier = ".\\Classifier\\Configurations\\Configurations.xml"
modelSerializationFile = ".\\Classifier\\Output\\classifier_model.bin"

# Start the Classifier:
#---------------------

classifier = Classifier(configFileClassifier, modelSerializationFile,
                        trainFeaturesExtractor.features,
                        trainFeaturesExtractor.labels,
                        testFeaturesExtractor.features,
                        testFeaturesExtractor.labels)

# Train
#classifier.Train()

# Evaluate on the held-out features.
labels, acc, val = classifier.Test()

# Build the confusion matrix from true vs. predicted labels.
mConfusionMatrix, mNormalConfusionMatrix, vNumTrainExamplesPerClass, vAccuracyPerClass, nOverallAccuracy = classifier.BuildConfusionMatrix(
    testFeaturesExtractor.labels, labels)
print(mConfusionMatrix)