def select_relevant(self):
    """Score the key-term features with the stored classifier and pick the top terms.

    The fitted logit results are loaded from
    ``KeytermClassifier.MODEL_STORE_FILE``.  The prediction for every row is
    written to a ``relevant_pred`` column of ``self.keyterm_feature_df``,
    which is then sorted in place by ``relevant_pred`` and ``cvalue``, both
    descending.

    Returns:
        Whatever ``self._top_selection()`` returns for the re-ordered frame.
    """
    from statsmodels.discrete.discrete_model import LogitResults

    # NOTE(review): statsmodels documents ``load`` as unpickling the file --
    # confirm the model store only ever holds trusted content.
    classifier = LogitResults.load(KeytermClassifier.MODEL_STORE_FILE)

    # Build the design matrix on a copy: drop the three columns that are not
    # fed to the model and add a constant column named 'intercept'.
    features = self.keyterm_feature_df.copy()
    features = features.drop(['doc_url', "is_url", 'term'], axis=1)
    features['intercept'] = 1

    self.keyterm_feature_df['relevant_pred'] = classifier.predict(features)

    # Highest predicted relevance first; ties are ordered by descending cvalue.
    self.keyterm_feature_df.sort_values(
        ["relevant_pred", "cvalue"],
        ascending=[False, False],
        inplace=True,
    )

    return self._top_selection()
def extract_test_keywords(train_dataset_file, test_dataset_file, retrain=False):
    """Obtain a classifier model, then extract keywords for the test dataset.

    A ``KeytermClassifier`` is built from the two dataset files.  Its model is
    trained, evaluated and saved to ``KeytermClassifier.MODEL_STORE_FILE``
    when ``retrain`` is true or when no file exists at that path; otherwise
    the previously saved model is loaded from that file.

    Args:
        train_dataset_file: Passed as the first argument to ``KeytermClassifier``.
        test_dataset_file: Passed as the second argument to ``KeytermClassifier``.
        retrain: When true, train and save a new model even if a stored one
            already exists.

    Returns:
        The result of ``KeytermClassifier.extract_test_keywords()``.
    """
    import os
    from statsmodels.discrete.discrete_model import LogitResults

    cl = KeytermClassifier(train_dataset_file, test_dataset_file)

    # Single training path: taken on explicit request or when there is no
    # stored model to fall back on.
    if retrain or not os.path.exists(KeytermClassifier.MODEL_STORE_FILE):
        # print(...) with one string argument emits the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Preparing training set ...")
        cl.prepare_training_set()
        print("Training model ...")
        cl.fit()
        print("Evaluating model ...")
        cl.prepare_test_set()
        cl.test()
        print("Saving model ...")
        cl.model.save(KeytermClassifier.MODEL_STORE_FILE)
    else:
        print("Loading model ...")
        # NOTE(review): statsmodels documents ``load`` as unpickling the
        # file -- confirm the model store only ever holds trusted content.
        cl.model = LogitResults.load(KeytermClassifier.MODEL_STORE_FILE)

    # The test set is prepared here unconditionally, i.e. a second time when
    # the training path above ran.  Kept as in the original call sequence;
    # whether the repeat is required cannot be determined from this function.
    print("Preparing test set ...")
    cl.prepare_test_set()
    print("Extracting keywords ...")
    return cl.extract_test_keywords()
def __init__(self, saved_classifier_file, topk=10):
    """Remember the configuration and load the saved classifier results.

    Args:
        saved_classifier_file: Path handed to ``LogitResults.load``; also
            kept on the instance as ``_classifier_file``.
        topk: Stored as ``self.topk``; defaults to 10.
    """
    self.topk = topk
    self._classifier_file = saved_classifier_file

    # NOTE(review): statsmodels documents ``load`` as unpickling the file --
    # confirm callers only pass trusted paths.
    self.model = LogitResults.load(saved_classifier_file)