def enhance_transactions(self):
    """Train a payee classifier and apply it to the imported transactions.

    Steps:

    1. Load the training data through ``ml.load_training_data``.
    2. Fit a linear SVC on narration, payee and day-of-month features,
       using the payees of the training data as labels.
    3. If ``self.predict_payees`` is set, add the predicted payee to each
       imported transaction.
    4. If ``self.suggest_payees`` is set, add to each transaction the
       known payees ordered by descending decision value.

    Returns:
        ``self.imported_transactions`` unchanged when no model could be
        trained (fewer than two training elements); otherwise
        ``self.transactions``, i.e. the enhanced transactions.
    """
    # load training data
    self.training_data = ml.load_training_data(
        self.training_data,
        filter_training_data_by_account=self.filter_training_data_by_account,
        existing_entries=self.existing_entries)

    # train the machine learning model
    self._trained = False
    if not self.training_data:
        logger.warning("Cannot train the machine learning model "
                       "because the training data is empty.")
    elif len(self.training_data) < 2:
        logger.warning(
            "Cannot train the machine learning model "
            "because the training data consists of less than two elements."
        )
    else:
        self._train_payee_pipeline()

    if not self._trained:
        logger.warning(
            "Cannot generate predictions or suggestions "
            "because there is no trained machine learning model.")
        return self.imported_transactions

    # Predict payees. This runs exactly once: the previous version ran the
    # whole prediction pass twice and threw the first result away.
    self.transactions = self.imported_transactions
    if self.predict_payees:
        self._add_predicted_payees()

    # suggest likely payees
    if self.suggest_payees:
        self._add_suggested_payees()

    return self.transactions


def _train_payee_pipeline(self):
    """Build ``self.pipeline``, fit it on ``self.training_data``, set ``self._trained``."""
    self.pipeline = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('narration', Pipeline([
                    ('getNarration', ml.GetNarration()),
                    ('vect', CountVectorizer(ngram_range=(1, 3))),
                ])),
                ('payee', Pipeline([  # any existing payee, if one exists
                    ('getPayee', ml.GetPayee()),
                    ('vect', CountVectorizer(ngram_range=(1, 3))),
                ])),
                ('dayOfMonth', Pipeline([
                    ('getDayOfMonth', ml.GetDayOfMonth()),
                    ('caster', ml.ArrayCaster()),  # need for issue with data shape
                ])),
            ],
            transformer_weights={
                'narration': 0.8,
                'payee': 0.5,
                'dayOfMonth': 0.1
            })),
        ('svc', SVC(kernel='linear')),
    ])
    logger.debug("About to train the machine learning model...")
    # The labels are the payees extracted from the training data itself.
    self.pipeline.fit(self.training_data,
                      ml.GetPayee().transform(self.training_data))
    logger.info("Finished training the machine learning model.")
    self._trained = True


def _add_predicted_payees(self):
    """Replace ``self.transactions`` with copies carrying the predicted payee."""
    logger.debug("About to generate predictions for payees...")
    predicted_payees: List[str]
    predicted_payees = self.pipeline.predict(self.imported_transactions)
    self.transactions = [
        ml.add_payee_to_transaction(
            transaction, payee, overwrite=self.overwrite_existing_payees)
        for transaction, payee in zip(self.imported_transactions,
                                      predicted_payees)
    ]
    logger.debug(
        "Finished adding predicted payees to the transactions to be imported."
    )


def _add_suggested_payees(self):
    """Attach to each transaction the payees sorted by descending decision value."""
    # get values from the SVC decision function
    logger.debug("About to generate suggestions about likely payees...")
    decision_values = self.pipeline.decision_function(
        self.imported_transactions)

    # NOTE(review): scikit-learn documents that decision_function yields a
    # single score per sample when the model knows exactly two classes; the
    # per-row zip below assumes one score per class. Confirm the behaviour
    # for training data that contains only two distinct payees.

    # add a human-readable class label (i.e., payee's name) to each value,
    # and sort by value:
    suggested_payees = [[
        payee for _, payee in sorted(
            zip(distance_values, self.pipeline.classes_),
            key=lambda x: x[0],
            reverse=True)
    ] for distance_values in decision_values]

    # add the suggested payees to each transaction:
    self.transactions = [
        ml.add_suggested_payees_to_transaction(transaction, payees)
        for transaction, payees in zip(self.transactions, suggested_payees)
    ]
    logger.debug(
        "Finished adding suggested payees to the transactions to be imported."
    )
def enhance_transactions(self):
    """Train an account classifier and apply it to the imported transactions.

    The training data is loaded via ``ml.load_training_data`` and expanded
    into ``ml.TxnPostingAccount`` records: one record for every pair of
    postings of a transaction whose accounts differ. A linear SVC is then
    fitted on narration, reference-account, (optionally) payee and
    day-of-month features, with the labels produced by
    ``ml.GetPostingAccount``.

    Depending on ``self.predict_second_posting`` and
    ``self.suggest_accounts``, predicted postings and/or ranked account
    suggestions are added to the imported transactions.

    Returns:
        ``self.imported_transactions`` unchanged when no model could be
        trained; otherwise ``self.transactions``.
    """
    # load training data
    self.training_data = ml.load_training_data(
        self.training_data,
        filter_training_data_by_account=self.filter_training_data_by_account,
        existing_entries=self.existing_entries)

    # convert training data to a list of TxnPostingAccounts
    records = []
    for txn in self.training_data:
        for reference in txn.postings:
            for posting in txn.postings:
                if posting.account != reference.account:
                    records.append(
                        ml.TxnPostingAccount(txn, posting, reference.account))
    self.converted_training_data = records

    # train the machine learning model
    self._trained = False
    if not self.converted_training_data:
        logger.warning("Cannot train the machine learning model "
                       "because the training data is empty.")
    elif len(self.converted_training_data) < 2:
        logger.warning(
            "Cannot train the machine learning model "
            "because the training data consists of less than two elements."
        )
    else:
        def ngram_branch(step_name, extractor):
            # Text feature branch: extract a string, then count 1- to 3-grams.
            return Pipeline([
                (step_name, extractor),
                ('vect', CountVectorizer(ngram_range=(1, 3))),
            ])

        # (name, transformer, weight) triples, in feature order.
        branches = [
            ('narration',
             ngram_branch('getNarration', ml.GetNarration()),
             0.8),
            ('account',
             ngram_branch('getReferencePostingAccount',
                          ml.GetReferencePostingAccount()),
             0.8),
        ]

        # The payee branch is only included when the training data holds
        # more than one distinct payee value.
        known_payees = {
            record.txn.payee for record in self.converted_training_data
        }
        if len(known_payees) > 1:
            branches.append(
                ('payee', ngram_branch('getPayee', ml.GetPayee()), 0.5))

        day_branch = Pipeline([
            ('getDayOfMonth', ml.GetDayOfMonth()),
            ('caster', ml.ArrayCaster()),  # need for issue with data shape
        ])
        branches.append(('dayOfMonth', day_branch, 0.1))

        transformers = [(name, branch) for name, branch, _ in branches]
        transformer_weights = {name: weight for name, _, weight in branches}

        self.pipeline = Pipeline([
            ('union', FeatureUnion(transformer_list=transformers,
                                   transformer_weights=transformer_weights)),
            ('svc', SVC(kernel='linear')),
        ])

        logger.debug("About to train the machine learning model...")
        labels = ml.GetPostingAccount().transform(self.converted_training_data)
        self.pipeline.fit(self.converted_training_data, labels)
        logger.info("Finished training the machine learning model.")
        self._trained = True

    if not self._trained:
        logger.warning(
            "Cannot generate predictions or suggestions "
            "because there is no trained machine learning model.")
        return self.imported_transactions

    # predict missing second postings
    self.transactions = self.imported_transactions
    if self.predict_second_posting:
        logger.debug(
            "About to generate predictions for missing second postings...")
        predicted_accounts: List[str] = self.pipeline.predict(
            self.imported_transactions)
        self.transactions = [
            ml.add_posting_to_transaction(transaction, account)
            for transaction, account in zip(self.transactions,
                                            predicted_accounts)
        ]
        logger.debug(
            "Finished adding predicted accounts to the transactions to be imported."
        )

    # suggest accounts that are likely involved in the transaction
    if self.suggest_accounts:
        # get values from the SVC decision function
        logger.debug(
            "About to generate suggestions about related accounts...")
        decision_values = self.pipeline.decision_function(
            self.imported_transactions)

        # pair each value with its class label (i.e., account name) and
        # order the labels by descending value:
        suggestions = []
        for distance_values in decision_values:
            scored = list(zip(distance_values, self.pipeline.classes_))
            scored.sort(key=lambda pair: pair[0], reverse=True)
            suggestions.append([account for _, account in scored])

        # add the suggested accounts to each transaction:
        self.transactions = [
            ml.add_suggested_accounts_to_transaction(transaction, accounts)
            for transaction, accounts in zip(self.transactions, suggestions)
        ]
        logger.debug(
            "Finished adding suggested accounts to the transactions to be imported."
        )

    return self.transactions
def test_get_payee(self):
    """Check the strings ``ml.GetNarration`` extracts from ``self.test_data``.

    NOTE(review): the test name refers to the payee, but the body exercises
    ``ml.GetNarration`` -- confirm which transformer is meant to be covered.
    """
    case_name = self.id().split('.')[-1]
    logger.info("Running Test Case: {id}".format(id=case_name))

    extracted = ml.GetNarration().transform(self.test_data)
    expected = ['Buying groceries', 'Coffee', 'Groceries', 'Coffee']
    self.assertEqual(extracted, expected)